feat: support nano banana pro (#10413)

* fix nanobanana

* add types

* 完成 fetch sse 和 google ai 侧转换

* thinking

* ui for part render

* support image in thinking

* fix issue

* support convert content part

* support nano banana pro image generation

* fix tests

* fix tests
This commit is contained in:
Arvin Xu
2025-11-26 00:16:44 +08:00
committed by GitHub
parent b78f24c67f
commit a93cfcd703
21 changed files with 1558 additions and 177 deletions

View File

@@ -1,4 +1,5 @@
import { filesPrompts } from '@lobechat/prompts';
import { MessageContentPart } from '@lobechat/types';
import { imageUrlToBase64 } from '@lobechat/utils/imageToBase64';
import { parseDataUri } from '@lobechat/utils/uriParser';
import { isDesktopLocalStaticServerUrl } from '@lobechat/utils/url';
@@ -9,6 +10,23 @@ import type { PipelineContext, ProcessorOptions } from '../types';
const log = debug('context-engine:processor:MessageContentProcessor');
/**
 * Attempt to parse a serialized content string into message content parts.
 *
 * Returns the parsed parts when the string is a non-empty JSON array whose
 * first element looks like a part (carries a `type` field); otherwise returns
 * null so callers can fall back to treating the content as plain text.
 */
const deserializeParts = (content: string): MessageContentPart[] | null => {
  let parsed: unknown;
  try {
    parsed = JSON.parse(content);
  } catch {
    // Not valid JSON — caller should treat the content as plain text
    return null;
  }

  if (!Array.isArray(parsed) || parsed.length === 0) return null;

  // Lightweight structural check: the first element must declare a part type
  return parsed[0]?.type ? (parsed as MessageContentPart[]) : null;
};
export interface FileContextConfig {
/** Whether to enable file context injection */
enabled?: boolean;
@@ -30,6 +48,7 @@ export interface MessageContentConfig {
}
export interface UserMessageContentPart {
googleThoughtSignature?: string;
image_url?: {
detail?: string;
url: string;
@@ -213,7 +232,7 @@ export class MessageContentProcessor extends BaseProcessor {
* Process assistant message content
*/
private async processAssistantMessage(message: any): Promise<any> {
// Check if there is reasoning content (thinking mode)
// Priority 1: Check if there is reasoning content with signature (thinking mode)
const shouldIncludeThinking = message.reasoning && !!message.reasoning?.signature;
if (shouldIncludeThinking) {
@@ -235,7 +254,59 @@ export class MessageContentProcessor extends BaseProcessor {
};
}
// Check if there are images (assistant messages may also contain images)
// Priority 2: Check if reasoning content is multimodal
const hasMultimodalReasoning = message.reasoning?.isMultimodal && message.reasoning?.content;
if (hasMultimodalReasoning) {
const reasoningParts = deserializeParts(message.reasoning.content);
if (reasoningParts) {
// Convert reasoning multimodal parts to plain text
const reasoningText = reasoningParts
.map((part) => {
if (part.type === 'text') return part.text;
if (part.type === 'image') return `[Image: ${part.image}]`;
return '';
})
.join('\n');
// Update reasoning to plain text
const updatedMessage = {
...message,
reasoning: {
...message.reasoning,
content: reasoningText,
isMultimodal: false, // Convert to non-multimodal
},
};
// Handle main content based on whether it's multimodal
if (message.metadata?.isMultimodal && message.content) {
const contentParts = deserializeParts(message.content);
if (contentParts) {
const convertedParts = this.convertMessagePartsToContentParts(contentParts);
return {
...updatedMessage,
content: convertedParts,
};
}
}
return updatedMessage;
}
}
// Priority 3: Check if message content is multimodal
const hasMultimodalContent = message.metadata?.isMultimodal && message.content;
if (hasMultimodalContent) {
const parts = deserializeParts(message.content);
if (parts) {
const contentParts = this.convertMessagePartsToContentParts(parts);
return { ...message, content: contentParts };
}
}
// Priority 4: Check if there are images (legacy imageList field)
const hasImages = message.imageList && message.imageList.length > 0;
if (hasImages && this.config.isCanUseVision?.(this.config.model, this.config.provider)) {
@@ -253,10 +324,7 @@ export class MessageContentProcessor extends BaseProcessor {
const imageContentParts = await this.processImageList(message.imageList || []);
contentParts.push(...imageContentParts);
return {
...message,
content: contentParts,
};
return { ...message, content: contentParts };
}
// Regular assistant message, return plain text content
@@ -266,6 +334,32 @@ export class MessageContentProcessor extends BaseProcessor {
};
}
/**
 * Convert MessageContentPart[] (internal storage format) into the
 * OpenAI-compatible UserMessageContentPart[] shape, carrying any
 * thought signature through as `googleThoughtSignature`.
 */
private convertMessagePartsToContentParts(parts: MessageContentPart[]): UserMessageContentPart[] {
  return parts.flatMap((part): UserMessageContentPart[] => {
    switch (part.type) {
      case 'text': {
        return [
          {
            googleThoughtSignature: part.thoughtSignature,
            text: part.text,
            type: 'text',
          },
        ];
      }
      case 'image': {
        // Images are already in S3 URL format, no conversion needed
        return [
          {
            googleThoughtSignature: part.thoughtSignature,
            image_url: { detail: 'auto', url: part.image },
            type: 'image_url',
          },
        ];
      }
      default: {
        // Unknown part types are dropped, matching the original behavior
        return [];
      }
    }
  });
}
/**
* Process image list
*/

View File

@@ -566,4 +566,243 @@ describe('MessageContentProcessor', () => {
expect(content[2].video_url.url).toBe('http://example.com/video.mp4');
});
});
// Tests for multimodal assistant message handling: content/reasoning stored as
// serialized MessageContentPart[] JSON is converted to OpenAI-compatible parts,
// and multimodal reasoning is flattened to plain text.
describe('Multimodal message content processing', () => {
  // metadata.isMultimodal + JSON-array content → OpenAI content parts
  it('should convert assistant message with metadata.isMultimodal to OpenAI format', async () => {
    const processor = new MessageContentProcessor({
      model: 'gpt-4',
      provider: 'openai',
      isCanUseVision: mockIsCanUseVision,
      fileContext: { enabled: false },
    });

    const messages: UIChatMessage[] = [
      {
        id: 'test',
        role: 'assistant',
        content: JSON.stringify([
          { type: 'text', text: 'Here is an image:' },
          { type: 'image', image: 'https://s3.example.com/image.png' },
          { type: 'text', text: 'What do you think?' },
        ]),
        metadata: {
          isMultimodal: true,
        },
        createdAt: Date.now(),
        updatedAt: Date.now(),
        meta: {},
      },
    ];

    const result = await processor.process(createContext(messages));

    expect(result.messages[0]).toMatchObject({
      content: [
        { type: 'text', text: 'Here is an image:' },
        {
          type: 'image_url',
          image_url: { detail: 'auto', url: 'https://s3.example.com/image.png' },
        },
        { type: 'text', text: 'What do you think?' },
      ],
    });
  });

  // reasoning.isMultimodal → flattened to plain text with "[Image: url]" markers
  it('should convert assistant message with reasoning.isMultimodal to plain text', async () => {
    const processor = new MessageContentProcessor({
      model: 'gpt-4',
      provider: 'openai',
      isCanUseVision: mockIsCanUseVision,
      fileContext: { enabled: false },
    });

    const messages: UIChatMessage[] = [
      {
        id: 'test',
        role: 'assistant',
        content: 'The answer is correct.',
        reasoning: {
          content: JSON.stringify([
            { type: 'text', text: 'Let me analyze this image:' },
            { type: 'image', image: 'https://s3.example.com/reasoning-image.png' },
            { type: 'text', text: 'Based on the analysis...' },
          ]),
          isMultimodal: true,
        },
        createdAt: Date.now(),
        updatedAt: Date.now(),
        meta: {},
      },
    ];

    const result = await processor.process(createContext(messages));

    expect(result.messages[0]).toMatchObject({
      reasoning: {
        content:
          'Let me analyze this image:\n[Image: https://s3.example.com/reasoning-image.png]\nBased on the analysis...',
        isMultimodal: false,
      },
      content: 'The answer is correct.',
    });
  });

  // Both flags set: reasoning flattened to text, content converted to parts
  it('should handle both reasoning.isMultimodal and metadata.isMultimodal', async () => {
    const processor = new MessageContentProcessor({
      model: 'gpt-4',
      provider: 'openai',
      isCanUseVision: mockIsCanUseVision,
      fileContext: { enabled: false },
    });

    const messages: UIChatMessage[] = [
      {
        id: 'test',
        role: 'assistant',
        content: JSON.stringify([
          { type: 'text', text: 'Final result:' },
          { type: 'image', image: 'https://s3.example.com/result.png' },
        ]),
        metadata: {
          isMultimodal: true,
        },
        reasoning: {
          content: JSON.stringify([
            { type: 'text', text: 'Thinking about:' },
            { type: 'image', image: 'https://s3.example.com/thinking.png' },
          ]),
          isMultimodal: true,
        },
        createdAt: Date.now(),
        updatedAt: Date.now(),
        meta: {},
      },
    ];

    const result = await processor.process(createContext(messages));

    expect(result.messages[0]).toMatchObject({
      reasoning: {
        content: 'Thinking about:\n[Image: https://s3.example.com/thinking.png]',
        isMultimodal: false,
      },
      content: [
        { type: 'text', text: 'Final result:' },
        {
          type: 'image_url',
          image_url: { detail: 'auto', url: 'https://s3.example.com/result.png' },
        },
      ],
    });
  });

  // Priority 1 (signed thinking) wins over Priority 2 (multimodal reasoning)
  it('should prioritize reasoning.signature over reasoning.isMultimodal', async () => {
    const processor = new MessageContentProcessor({
      model: 'gpt-4',
      provider: 'openai',
      isCanUseVision: mockIsCanUseVision,
      fileContext: { enabled: false },
    });

    const messages: UIChatMessage[] = [
      {
        id: 'test',
        role: 'assistant',
        content: 'The answer.',
        reasoning: {
          content: 'Some thinking process',
          signature: 'sig123',
          // Even if isMultimodal is true, signature takes priority
          isMultimodal: true,
        },
        createdAt: Date.now(),
        updatedAt: Date.now(),
        meta: {},
      },
    ];

    const result = await processor.process(createContext(messages));

    expect(result.messages[0]).toMatchObject({
      content: [
        {
          type: 'thinking',
          thinking: 'Some thinking process',
          signature: 'sig123',
        },
        { type: 'text', text: 'The answer.' },
      ],
    });
  });

  // Graceful fallback: non-JSON content passes through untouched
  it('should handle plain text when isMultimodal is true but content is not valid JSON', async () => {
    const processor = new MessageContentProcessor({
      model: 'gpt-4',
      provider: 'openai',
      isCanUseVision: mockIsCanUseVision,
      fileContext: { enabled: false },
    });

    const messages: UIChatMessage[] = [
      {
        id: 'test',
        role: 'assistant',
        content: 'This is plain text, not JSON',
        metadata: {
          isMultimodal: true,
        },
        createdAt: Date.now(),
        updatedAt: Date.now(),
        meta: {},
      },
    ];

    const result = await processor.process(createContext(messages));

    expect(result.messages[0]).toMatchObject({
      content: 'This is plain text, not JSON',
    });
  });

  // Part-level thoughtSignature must map to googleThoughtSignature
  it('should preserve thoughtSignature in multimodal content parts', async () => {
    const processor = new MessageContentProcessor({
      model: 'gpt-4',
      provider: 'openai',
      isCanUseVision: mockIsCanUseVision,
      fileContext: { enabled: false },
    });

    const messages: UIChatMessage[] = [
      {
        id: 'test',
        role: 'assistant',
        content: JSON.stringify([
          { type: 'text', text: 'Analysis result:', thoughtSignature: 'sig-001' },
          { type: 'image', image: 'https://s3.example.com/chart.png', thoughtSignature: 'sig-002' },
          { type: 'text', text: 'Conclusion' },
        ]),
        metadata: {
          isMultimodal: true,
        },
        createdAt: Date.now(),
        updatedAt: Date.now(),
        meta: {},
      },
    ];

    const result = await processor.process(createContext(messages));

    expect(result.messages[0]).toMatchObject({
      content: [
        { type: 'text', text: 'Analysis result:', googleThoughtSignature: 'sig-001' },
        {
          type: 'image_url',
          image_url: { detail: 'auto', url: 'https://s3.example.com/chart.png' },
          googleThoughtSignature: 'sig-002',
        },
        { type: 'text', text: 'Conclusion' },
      ],
    });
  });
});
});

View File

@@ -71,6 +71,22 @@ export interface MessageGroundingChunk {
type: 'grounding';
}
/**
 * SSE chunk carrying one streamed piece of a multimodal reasoning (thinking)
 * block — either a text fragment or a base64-encoded image.
 */
export interface MessageReasoningPartChunk {
  // Text fragment or base64 image data, depending on `partType`
  content: string;
  // MIME type of the payload when `partType` is 'image' (e.g. 'image/jpeg')
  mimeType?: string;
  partType: 'text' | 'image';
  // Provider-issued thought signature (seen with Google Gemini responses)
  thoughtSignature?: string;
  type: 'reasoning_part';
}
/**
 * SSE chunk carrying one streamed piece of the main answer content — either
 * a text fragment or a base64-encoded image (e.g. image-generation output).
 */
export interface MessageContentPartChunk {
  // Text fragment or base64 image data, depending on `partType`
  content: string;
  // MIME type of the payload when `partType` is 'image'
  mimeType?: string;
  partType: 'text' | 'image';
  // Provider-issued thought signature (seen with Google Gemini responses)
  thoughtSignature?: string;
  type: 'content_part';
}
interface MessageToolCallsChunk {
isAnimationActives?: boolean[];
tool_calls: MessageToolCall[];
@@ -87,6 +103,8 @@ export interface FetchSSEOptions {
| MessageTextChunk
| MessageToolCallsChunk
| MessageReasoningChunk
| MessageReasoningPartChunk
| MessageContentPartChunk
| MessageGroundingChunk
| MessageUsageChunk
| MessageBase64ImageChunk
@@ -420,6 +438,18 @@ export const fetchSSE = async (url: string, options: RequestInit & FetchSSEOptio
break;
}
case 'reasoning_part':
case 'content_part': {
options.onMessageHandle?.({
content: data.content,
mimeType: data.mimeType,
partType: data.partType,
thoughtSignature: data.thoughtSignature,
type: ev.event,
});
break;
}
case 'tool_calls': {
// get finial
// if there is no tool calls, we should initialize the tool calls

View File

@@ -33,7 +33,7 @@ describe('google contextBuilders', () => {
const result = await buildGooglePart(content);
expect(result).toEqual({ text: 'Hello' });
expect(result).toEqual({ text: 'Hello', thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE });
});
it('should handle thinking type messages', async () => {
@@ -71,6 +71,7 @@ describe('google contextBuilders', () => {
data: 'iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg==',
mimeType: 'image/png',
},
thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE,
});
});
@@ -101,6 +102,7 @@ describe('google contextBuilders', () => {
data: mockBase64,
mimeType: 'image/png',
},
thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE,
});
expect(imageToBase64Module.imageUrlToBase64).toHaveBeenCalledWith(imageUrl);
@@ -144,6 +146,7 @@ describe('google contextBuilders', () => {
data: 'mockVideoBase64Data',
mimeType: 'video/mp4',
},
thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE,
});
});
});
@@ -158,7 +161,7 @@ describe('google contextBuilders', () => {
const converted = await buildGoogleMessage(message);
expect(converted).toEqual({
parts: [{ text: 'Hello' }],
parts: [{ text: 'Hello', thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE }],
role: 'model',
});
});
@@ -172,7 +175,7 @@ describe('google contextBuilders', () => {
const converted = await buildGoogleMessage(message);
expect(converted).toEqual({
parts: [{ text: 'Hi' }],
parts: [{ text: 'Hi', thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE }],
role: 'user',
});
});
@@ -196,8 +199,11 @@ describe('google contextBuilders', () => {
expect(converted).toEqual({
parts: [
{ text: 'Check this image:' },
{ inlineData: { data: '...', mimeType: 'image/png' } },
{ text: 'Check this image:', thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE },
{
inlineData: { data: '...', mimeType: 'image/png' },
thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE,
},
],
role: 'user',
});
@@ -334,10 +340,18 @@ describe('google contextBuilders', () => {
expect(contents).toEqual([
{
parts: [{ text: '<plugins>Web Browsing plugin available</plugins>' }],
parts: [
{
text: '<plugins>Web Browsing plugin available</plugins>',
thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE,
},
],
role: 'user',
},
{
parts: [{ text: '杭州天气如何', thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE }],
role: 'user',
},
{ parts: [{ text: '杭州天气如何' }], role: 'user' },
{
parts: [
{
@@ -421,7 +435,7 @@ describe('google contextBuilders', () => {
expect(contents).toEqual([
{
parts: [{ text: '杭州天气如何' }],
parts: [{ text: '杭州天气如何', thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE }],
role: 'user',
},
{
@@ -507,7 +521,7 @@ describe('google contextBuilders', () => {
expect(contents).toEqual([
{
parts: [{ text: 'First question' }],
parts: [{ text: 'First question', thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE }],
role: 'user',
},
{
@@ -534,7 +548,7 @@ describe('google contextBuilders', () => {
role: 'user',
},
{
parts: [{ text: 'Second question' }],
parts: [{ text: 'Second question', thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE }],
role: 'user',
},
{
@@ -604,11 +618,16 @@ describe('google contextBuilders', () => {
expect(contents).toEqual([
{
parts: [{ text: '<plugins>Web Browsing plugin available</plugins>' }],
parts: [
{
text: '<plugins>Web Browsing plugin available</plugins>',
thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE,
},
],
role: 'user',
},
{
parts: [{ text: '杭州天气如何' }],
parts: [{ text: '杭州天气如何', thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE }],
role: 'user',
},
{
@@ -635,7 +654,7 @@ describe('google contextBuilders', () => {
role: 'user',
},
{
parts: [{ text: 'Please try again' }],
parts: [{ text: 'Please try again', thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE }],
role: 'user',
},
]);
@@ -651,7 +670,7 @@ describe('google contextBuilders', () => {
const converted = await buildGoogleMessage(message);
expect(converted).toEqual({
parts: [{ text: '' }],
parts: [{ text: '', thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE }],
role: 'user',
});
});
@@ -690,7 +709,12 @@ describe('google contextBuilders', () => {
const contents = await buildGoogleMessages(messages);
expect(contents).toHaveLength(1);
expect(contents).toEqual([{ parts: [{ text: 'Hello' }], role: 'user' }]);
expect(contents).toEqual([
{
parts: [{ text: 'Hello', thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE }],
role: 'user',
},
]);
});
it('should not modify the length if model is gemini-1.5-pro', async () => {
@@ -703,8 +727,14 @@ describe('google contextBuilders', () => {
expect(contents).toHaveLength(2);
expect(contents).toEqual([
{ parts: [{ text: 'Hello' }], role: 'user' },
{ parts: [{ text: 'Hi' }], role: 'model' },
{
parts: [{ text: 'Hello', thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE }],
role: 'user',
},
{
parts: [{ text: 'Hi', thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE }],
role: 'model',
},
]);
});
@@ -730,7 +760,13 @@ describe('google contextBuilders', () => {
expect(contents).toHaveLength(1);
expect(contents).toEqual([
{
parts: [{ text: 'Hello' }, { inlineData: { data: '...', mimeType: 'image/png' } }],
parts: [
{ text: 'Hello', thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE },
{
inlineData: { data: '...', mimeType: 'image/png' },
thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE,
},
],
role: 'user',
},
]);
@@ -801,8 +837,14 @@ describe('google contextBuilders', () => {
expect(contents).toHaveLength(2);
expect(contents).toEqual([
{ parts: [{ text: 'Hello' }], role: 'user' },
{ parts: [{ text: 'Hi' }], role: 'model' },
{
parts: [{ text: 'Hello', thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE }],
role: 'user',
},
{
parts: [{ text: 'Hi', thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE }],
role: 'model',
},
]);
});
@@ -817,8 +859,14 @@ describe('google contextBuilders', () => {
expect(contents).toHaveLength(2);
expect(contents).toEqual([
{ parts: [{ text: 'Hello' }], role: 'user' },
{ parts: [{ text: 'Hi' }], role: 'model' },
{
parts: [{ text: 'Hello', thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE }],
role: 'user',
},
{
parts: [{ text: 'Hi', thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE }],
role: 'model',
},
]);
});
@@ -857,8 +905,14 @@ describe('google contextBuilders', () => {
const contents = await buildGoogleMessages(messages);
expect(contents).toEqual([
{ parts: [{ text: 'system prompt' }], role: 'user' },
{ parts: [{ text: 'LobeChat 最新版本' }], role: 'user' },
{
parts: [{ text: 'system prompt', thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE }],
role: 'user',
},
{
parts: [{ text: 'LobeChat 最新版本', thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE }],
role: 'user',
},
{
parts: [
{

View File

@@ -29,7 +29,10 @@ export const buildGooglePart = async (
}
case 'text': {
return { text: content.text };
return {
text: content.text,
thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE,
};
}
case 'image_url': {
@@ -42,6 +45,7 @@ export const buildGooglePart = async (
return {
inlineData: { data: base64, mimeType: mimeType || 'image/png' },
thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE,
};
}
@@ -50,6 +54,7 @@ export const buildGooglePart = async (
return {
inlineData: { data: base64, mimeType },
thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE,
};
}
@@ -66,6 +71,7 @@ export const buildGooglePart = async (
return {
inlineData: { data: base64, mimeType: mimeType || 'video/mp4' },
thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE,
};
}
@@ -76,6 +82,7 @@ export const buildGooglePart = async (
return {
inlineData: { data: base64, mimeType },
thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE,
};
}
@@ -126,7 +133,8 @@ export const buildGoogleMessage = async (
}
const getParts = async () => {
if (typeof content === 'string') return [{ text: content }];
if (typeof content === 'string')
return [{ text: content, thoughtSignature: GEMINI_MAGIC_THOUGHT_SIGNATURE }];
const parts = await Promise.all(content.map(async (c) => await buildGooglePart(c)));
return parts.filter(Boolean) as Part[];

View File

@@ -251,16 +251,16 @@ describe('GoogleGenerativeAIStream', () => {
expect(chunks).toEqual(
[
'id: chat_1',
'event: text',
'data: "234"\n',
'event: content_part',
'data: {"content":"234","partType":"text"}\n',
'id: chat_1',
'event: text',
'data: ""\n',
'id: chat_1',
'event: text',
`data: "567890\\n"\n`,
'event: content_part',
`data: {"content":"567890\\n","partType":"text"}\n`,
// stop
'id: chat_1',
'event: stop',
@@ -376,20 +376,20 @@ describe('GoogleGenerativeAIStream', () => {
expect(chunks).toEqual(
[
'id: chat_1',
'event: reasoning',
'data: "**Understanding the Conditional Logic**\\n\\n"\n',
'event: reasoning_part',
'data: {"content":"**Understanding the Conditional Logic**\\n\\n","inReasoning":true,"partType":"text"}\n',
'id: chat_1',
'event: reasoning',
`data: "**Finalizing Interpretation**\\n\\n"\n`,
'event: reasoning_part',
`data: {"content":"**Finalizing Interpretation**\\n\\n","inReasoning":true,"partType":"text"}\n`,
'id: chat_1',
'event: text',
`data: "简单来说,"\n`,
'event: content_part',
`data: {"content":"简单来说,","partType":"text"}\n`,
'id: chat_1',
'event: text',
`data: "文本内容。"\n`,
'event: content_part',
`data: {"content":"文本内容。","partType":"text"}\n`,
// stop
'id: chat_1',
'event: stop',
@@ -471,12 +471,12 @@ describe('GoogleGenerativeAIStream', () => {
expect(chunks).toEqual(
[
'id: chat_1',
'event: text',
'data: "234"\n',
'event: content_part',
'data: {"content":"234","partType":"text"}\n',
'id: chat_1',
'event: text',
`data: "567890\\n"\n`,
'event: content_part',
`data: {"content":"567890\\n","partType":"text"}\n`,
// stop
'id: chat_1',
'event: stop',
@@ -1166,8 +1166,8 @@ describe('GoogleGenerativeAIStream', () => {
expect(chunks).toEqual(
[
'id: chat_1',
'event: text',
'data: "你好!很高兴为你服务。请问有什么我可以帮你的吗?\\n\\n无论是回答问题、协助写作、翻译还是随便聊聊我都随时待命"\n',
'event: content_part',
'data: {"content":"你好!很高兴为你服务。请问有什么我可以帮你的吗?\\n\\n无论是回答问题、协助写作、翻译还是随便聊聊我都随时待命","partType":"text"}\n',
'id: chat_1',
'event: stop',
@@ -1286,8 +1286,8 @@ describe('GoogleGenerativeAIStream', () => {
expect(chunks).toEqual(
[
'id: chat_1',
'event: text',
'data: "Here is my answer"\n',
'event: content_part',
'data: {"content":"Here is my answer","partType":"text","thoughtSignature":"sig123"}\n',
'id: chat_1',
'event: stop',
@@ -1300,4 +1300,435 @@ describe('GoogleGenerativeAIStream', () => {
);
});
});
describe('Multimodal parts (reasoning_part and content_part)', () => {
it('should handle mixed reasoning text and reasoning image parts', async () => {
vi.spyOn(uuidModule, 'nanoid').mockReturnValueOnce('1');
const data = [
{
candidates: [
{
content: {
parts: [
{
text: "**Clarifying the Core Concept**\n\nI'm now focusing on the visual metaphor. I plan to depict Agent Runtime as a software environment that manages and executes agents' tasks, similar to how an operating system functions. I aim to create an informative and intuitive infographic.\n\n\n",
thought: true,
},
],
role: 'model',
},
index: 0,
},
],
usageMetadata: {
promptTokenCount: 9,
totalTokenCount: 9,
promptTokensDetails: [{ modality: 'TEXT', tokenCount: 9 }],
},
modelVersion: 'gemini-3-pro-image-preview',
responseId: 'MRYkaeWsL5bSjMcPlsLRiQo',
},
{
candidates: [
{
content: {
parts: [
{
text: '**Developing Visual Representation**\n\nI\'m now iterating on the visual representation. The "command center" metaphor is proving useful. I\'m focusing on the interplay of the core components: the central engine coordinates perception, memory, planning, and action, with tools and plugins as extensions. The goal is to clearly show the flow of information through the system, from input to output, using visual cues. The aesthetic aims for a futuristic, tech-inspired look with glowing lines and circuit board elements, using a palette of blues, purples, and oranges.\n\n\n',
thought: true,
},
],
role: 'model',
},
index: 0,
},
],
usageMetadata: {
promptTokenCount: 9,
totalTokenCount: 9,
promptTokensDetails: [{ modality: 'TEXT', tokenCount: 9 }],
},
modelVersion: 'gemini-3-pro-image-preview',
responseId: 'MRYkaeWsL5bSjMcPlsLRiQo',
},
{
candidates: [
{
content: {
parts: [
{
text: "**Constructing the Architecture**\n\nI'm presently building out the architecture of the infographic. I've broken down \"Agent Runtime\" into its core components and I'm designing the visual relationships between them. The central engine will be the focal point, with modules for perception, memory, planning, action, and tools radiating outwards. My aim is to illustrate the workflow from input to output clearly. I'll utilize arrows to represent the flow of data and instructions between each module.\n\n\n",
thought: true,
},
],
role: 'model',
},
index: 0,
},
],
usageMetadata: {
promptTokenCount: 9,
totalTokenCount: 9,
promptTokensDetails: [{ modality: 'TEXT', tokenCount: 9 }],
},
modelVersion: 'gemini-3-pro-image-preview',
responseId: 'MRYkaeWsL5bSjMcPlsLRiQo',
},
{
candidates: [
{
content: {
parts: [
{
inlineData: {
mimeType: 'image/jpeg',
data: '/9j/4AAQSkZJRgABAQEBLAEsAAD/2wBDAAEBAQEBAQEBAQEBAQEBAQEBAQBiUgnjeAMHrkrk/mMk8k57YpwnCsBtzkcgk+/ONuO3QE4JJ3ccuJ373EbEg5KgDkAg4xtySNrHv14HYm9ne7au7JpLRfm+vn63Emtk9FZWv1t+Ntt9L9Wj//2Q==',
},
thought: true,
},
],
role: 'model',
},
index: 0,
},
],
usageMetadata: {
promptTokenCount: 9,
totalTokenCount: 9,
promptTokensDetails: [{ modality: 'TEXT', tokenCount: 9 }],
},
modelVersion: 'gemini-3-pro-image-preview',
responseId: 'MRYkaeWsL5bSjMcPlsLRiQo',
},
{
candidates: [
{
content: {
parts: [
{
text: '**Constructing an Infographic**\n\nI\'ve successfully created an infographic depicting an "Agent Runtime." The design employs a tech-inspired circuit board aesthetic, placing the core engine at the center. I\'ve clearly represented six essential modules: perception, memory, planning, action, tools, and learning. Arrows and text annotations vividly illustrate the data flow and processing.\n\n\n',
thought: true,
},
],
role: 'model',
},
index: 0,
},
],
usageMetadata: {
promptTokenCount: 9,
totalTokenCount: 9,
promptTokensDetails: [{ modality: 'TEXT', tokenCount: 9 }],
},
modelVersion: 'gemini-3-pro-image-preview',
responseId: 'MRYkaeWsL5bSjMcPlsLRiQo',
},
{
candidates: [
{
content: {
parts: [
{
text: "**Defining Agent Runtime Modules**\n\nI'm making progress clarifying the architecture of an \"Agent Runtime\" system. I've designed an infographic with a circuit board aesthetic, centered on the core engine. Six key modules are now visualized: perception, memory, planning, action, tools, and learning. I've incorporated arrows and annotations to show data flow effectively.\n\n\n",
thought: true,
},
],
role: 'model',
},
index: 0,
},
],
usageMetadata: {
promptTokenCount: 9,
totalTokenCount: 9,
promptTokensDetails: [{ modality: 'TEXT', tokenCount: 9 }],
},
modelVersion: 'gemini-3-pro-image-preview',
responseId: 'MRYkaeWsL5bSjMcPlsLRiQo',
},
{
candidates: [
{
content: {
parts: [
{
inlineData: {
mimeType: 'image/jpeg',
data: '/9j/4AAQSkZJRgABAQEBLAEsAAD/2wBDAAEBAQEBAQEBA2Q==',
},
thoughtSignature:
'EueybArjsmwB0e2Kby+QPRkacnmPuV+CqMr6tiey3M5BHLHgIiggQOMeFmnKzsoux6PI6dQMgmdbXE1OTLLcWUmUD1CgFn+C2VdI09FpHrVhxVAtSk/zFVSlsjfCuANxtkP8tCDppVZqIya0QYjzg5K1fEO0m42CZX2/MHyqL8NjzR0lT8ENdoV3RSaK2tXqPH45uIb6nGeBSuX1n2EUMzO',
},
],
role: 'model',
},
finishReason: 'STOP',
index: 0,
},
],
usageMetadata: {
promptTokenCount: 9,
candidatesTokenCount: 1358,
totalTokenCount: 1728,
promptTokensDetails: [{ modality: 'TEXT', tokenCount: 9 }],
candidatesTokensDetails: [{ modality: 'IMAGE', tokenCount: 1120 }],
thoughtsTokenCount: 361,
},
modelVersion: 'gemini-3-pro-image-preview',
responseId: 'MRYkaeWsL5bSjMcPlsLRiQo',
},
];
const mockGoogleStream = new ReadableStream({
start(controller) {
data.forEach((item) => {
controller.enqueue(item);
});
controller.close();
},
});
const protocolStream = GoogleGenerativeAIStream(mockGoogleStream);
const chunks = await decodeStreamChunks(protocolStream);
expect(chunks).toEqual(
[
// First reasoning text
'id: chat_1',
'event: reasoning_part',
'data: {"content":"**Clarifying the Core Concept**\\n\\nI\'m now focusing on the visual metaphor. I plan to depict Agent Runtime as a software environment that manages and executes agents\' tasks, similar to how an operating system functions. I aim to create an informative and intuitive infographic.\\n\\n\\n","inReasoning":true,"partType":"text"}\n',
// Second reasoning text
'id: chat_1',
'event: reasoning_part',
'data: {"content":"**Developing Visual Representation**\\n\\nI\'m now iterating on the visual representation. The \\"command center\\" metaphor is proving useful. I\'m focusing on the interplay of the core components: the central engine coordinates perception, memory, planning, and action, with tools and plugins as extensions. The goal is to clearly show the flow of information through the system, from input to output, using visual cues. The aesthetic aims for a futuristic, tech-inspired look with glowing lines and circuit board elements, using a palette of blues, purples, and oranges.\\n\\n\\n","inReasoning":true,"partType":"text"}\n',
// Third reasoning text
'id: chat_1',
'event: reasoning_part',
'data: {"content":"**Constructing the Architecture**\\n\\nI\'m presently building out the architecture of the infographic. I\'ve broken down \\"Agent Runtime\\" into its core components and I\'m designing the visual relationships between them. The central engine will be the focal point, with modules for perception, memory, planning, action, and tools radiating outwards. My aim is to illustrate the workflow from input to output clearly. I\'ll utilize arrows to represent the flow of data and instructions between each module.\\n\\n\\n","inReasoning":true,"partType":"text"}\n',
// First reasoning image
'id: chat_1',
'event: reasoning_part',
'data: {"content":"/9j/4AAQSkZJRgABAQEBLAEsAAD/2wBDAAEBAQEBAQEBAQEBAQEBAQEBAQBiUgnjeAMHrkrk/mMk8k57YpwnCsBtzkcgk+/ONuO3QE4JJ3ccuJ373EbEg5KgDkAg4xtySNrHv14HYm9ne7au7JpLRfm+vn63Emtk9FZWv1t+Ntt9L9Wj//2Q==","inReasoning":true,"mimeType":"image/jpeg","partType":"image"}\n',
// Fourth reasoning text
'id: chat_1',
'event: reasoning_part',
'data: {"content":"**Constructing an Infographic**\\n\\nI\'ve successfully created an infographic depicting an \\"Agent Runtime.\\" The design employs a tech-inspired circuit board aesthetic, placing the core engine at the center. I\'ve clearly represented six essential modules: perception, memory, planning, action, tools, and learning. Arrows and text annotations vividly illustrate the data flow and processing.\\n\\n\\n","inReasoning":true,"partType":"text"}\n',
// Fifth reasoning text
'id: chat_1',
'event: reasoning_part',
'data: {"content":"**Defining Agent Runtime Modules**\\n\\nI\'m making progress clarifying the architecture of an \\"Agent Runtime\\" system. I\'ve designed an infographic with a circuit board aesthetic, centered on the core engine. Six key modules are now visualized: perception, memory, planning, action, tools, and learning. I\'ve incorporated arrows and annotations to show data flow effectively.\\n\\n\\n","inReasoning":true,"partType":"text"}\n',
// Content image (with thoughtSignature but not thought:true)
'id: chat_1',
'event: content_part',
'data: {"content":"/9j/4AAQSkZJRgABAQEBLAEsAAD/2wBDAAEBAQEBAQEBA2Q==","mimeType":"image/jpeg","partType":"image","thoughtSignature":"EueybArjsmwB0e2Kby+QPRkacnmPuV+CqMr6tiey3M5BHLHgIiggQOMeFmnKzsoux6PI6dQMgmdbXE1OTLLcWUmUD1CgFn+C2VdI09FpHrVhxVAtSk/zFVSlsjfCuANxtkP8tCDppVZqIya0QYjzg5K1fEO0m42CZX2/MHyqL8NjzR0lT8ENdoV3RSaK2tXqPH45uIb6nGeBSuX1n2EUMzO"}\n',
// stop
'id: chat_1',
'event: stop',
'data: "STOP"\n',
// usage
'id: chat_1',
'event: usage',
'data: {"inputTextTokens":9,"outputImageTokens":1120,"outputReasoningTokens":361,"outputTextTokens":238,"totalInputTokens":9,"totalOutputTokens":1719,"totalTokens":1728}\n',
].map((i) => i + '\n'),
);
});
it('should handle content text and image parts without reasoning', async () => {
  // Pin the generated stream id so the expected SSE `id:` lines are deterministic
  vi.spyOn(uuidModule, 'nanoid').mockReturnValueOnce('1');
  // Three raw Google GenAI chunks: a text part, an inline image part,
  // then a final text part carrying finishReason STOP and full usage metadata
  const data = [
    {
      candidates: [
        {
          content: {
            parts: [
              {
                text: 'This is the description: ',
              },
            ],
            role: 'model',
          },
          index: 0,
        },
      ],
      usageMetadata: {
        promptTokenCount: 5,
        totalTokenCount: 5,
        promptTokensDetails: [{ modality: 'TEXT', tokenCount: 5 }],
      },
      modelVersion: 'gemini-3-pro-image-preview',
    },
    {
      candidates: [
        {
          content: {
            parts: [
              {
                inlineData: {
                  mimeType: 'image/png',
                  data: 'iVBORw0KGgoAAAANSUhEUgAAAAUA',
                },
              },
            ],
            role: 'model',
          },
          index: 0,
        },
      ],
      usageMetadata: {
        promptTokenCount: 5,
        totalTokenCount: 5,
        promptTokensDetails: [{ modality: 'TEXT', tokenCount: 5 }],
      },
      modelVersion: 'gemini-3-pro-image-preview',
    },
    {
      candidates: [
        {
          content: {
            parts: [
              {
                text: ' an example.',
              },
            ],
            role: 'model',
          },
          finishReason: 'STOP',
          index: 0,
        },
      ],
      usageMetadata: {
        promptTokenCount: 5,
        candidatesTokenCount: 10,
        totalTokenCount: 15,
        promptTokensDetails: [{ modality: 'TEXT', tokenCount: 5 }],
        candidatesTokensDetails: [{ modality: 'TEXT', tokenCount: 10 }],
      },
      modelVersion: 'gemini-3-pro-image-preview',
    },
  ];
  // Feed the chunks through a ReadableStream, mimicking the SDK's streamed response
  const mockGoogleStream = new ReadableStream({
    start(controller) {
      data.forEach((item) => {
        controller.enqueue(item);
      });
      controller.close();
    },
  });
  const protocolStream = GoogleGenerativeAIStream(mockGoogleStream);
  const chunks = await decodeStreamChunks(protocolStream);
  // Gemini 3 image models always emit content_part events (new multimodal format),
  // even for plain text parts; stop and usage events follow the final chunk.
  // Note: content_part payloads carry no `inReasoning` flag (only reasoning_part does).
  expect(chunks).toEqual(
    [
      'id: chat_1',
      'event: content_part',
      'data: {"content":"This is the description: ","partType":"text"}\n',
      'id: chat_1',
      'event: content_part',
      'data: {"content":"iVBORw0KGgoAAAANSUhEUgAAAAUA","mimeType":"image/png","partType":"image"}\n',
      'id: chat_1',
      'event: content_part',
      'data: {"content":" an example.","partType":"text"}\n',
      'id: chat_1',
      'event: stop',
      'data: "STOP"\n',
      'id: chat_1',
      'event: usage',
      'data: {"inputTextTokens":5,"outputImageTokens":0,"outputTextTokens":10,"totalInputTokens":5,"totalOutputTokens":10,"totalTokens":15}\n',
    ].map((i) => i + '\n'),
  );
});
it('should handle mixed reasoning and content parts in single chunk', async () => {
  // Pin the generated stream id so the expected SSE `id:` lines are deterministic
  vi.spyOn(uuidModule, 'nanoid').mockReturnValueOnce('1');
  // A single chunk containing a reasoning part (thought: true), a content text part,
  // and a content image part, plus finishReason and usage (incl. thoughtsTokenCount)
  const data = [
    {
      candidates: [
        {
          content: {
            parts: [
              {
                text: 'Analyzing the request...',
                thought: true,
              },
              {
                text: 'Here is the answer: ',
              },
              {
                inlineData: {
                  mimeType: 'image/png',
                  data: 'base64data',
                },
              },
            ],
            role: 'model',
          },
          finishReason: 'STOP',
          index: 0,
        },
      ],
      usageMetadata: {
        promptTokenCount: 10,
        candidatesTokenCount: 20,
        totalTokenCount: 30,
        promptTokensDetails: [{ modality: 'TEXT', tokenCount: 10 }],
        thoughtsTokenCount: 5,
      },
      modelVersion: 'gemini-3-pro-image-preview',
    },
  ];
  // Feed the chunk through a ReadableStream, mimicking the SDK's streamed response
  const mockGoogleStream = new ReadableStream({
    start(controller) {
      data.forEach((item) => {
        controller.enqueue(item);
      });
      controller.close();
    },
  });
  const protocolStream = GoogleGenerativeAIStream(mockGoogleStream);
  const chunks = await decodeStreamChunks(protocolStream);
  // Expect one event per part, in order: reasoning_part (flagged inReasoning:true),
  // then both content_part events, then stop and usage. Reasoning tokens are
  // surfaced separately as outputReasoningTokens.
  expect(chunks).toEqual(
    [
      'id: chat_1',
      'event: reasoning_part',
      'data: {"content":"Analyzing the request...","inReasoning":true,"partType":"text"}\n',
      'id: chat_1',
      'event: content_part',
      'data: {"content":"Here is the answer: ","partType":"text"}\n',
      'id: chat_1',
      'event: content_part',
      'data: {"content":"base64data","mimeType":"image/png","partType":"image"}\n',
      'id: chat_1',
      'event: stop',
      'data: "STOP"\n',
      'id: chat_1',
      'event: usage',
      'data: {"inputTextTokens":10,"outputImageTokens":0,"outputReasoningTokens":5,"outputTextTokens":20,"totalInputTokens":10,"totalOutputTokens":25,"totalTokens":30}\n',
    ].map((i) => i + '\n'),
  );
});
});
});

View File

@@ -7,6 +7,7 @@ import { convertGoogleAIUsage } from '../../usageConverters/google-ai';
import {
ChatPayloadForTransformStream,
StreamContext,
StreamPartChunkData,
StreamProtocolChunk,
StreamToolCallChunkData,
createCallbacksTransformer,
@@ -114,12 +115,121 @@ const transformGoogleGenerativeAIStream = (
.join('') || '';
if (candidate) {
// 首先检查是否为 reasoning 内容 (thought: true)
if (Array.isArray(candidate.content?.parts) && candidate.content.parts.length > 0) {
// Check if this response contains reasoning or multimodal content
const parts = candidate.content?.parts || [];
const hasReasoningParts = parts.some((p: any) => p.thought === true);
const hasImageParts = parts.some((p: any) => p.inlineData);
const hasThoughtSignature = parts.some((p: any) => p.thoughtSignature);
const hasThoughtsInMetadata = (usageMetadata as any)?.thoughtsTokenCount > 0;
// Check model version to determine if new format should be used
const modelVersion = (chunk as any).modelVersion || '';
const isGemini25Plus = modelVersion.includes('gemini-2.5') || modelVersion.includes('gemini-3');
const isGemini3Model =
modelVersion.includes('gemini-3') || modelVersion.includes('image-preview');
// Check if this is the old single-image scenario (single image part with finishReason)
// This should use the legacy base64_image event format (only for gemini-2.0 and earlier)
const isSingleImageWithFinish =
parts.length === 1 &&
hasImageParts &&
!hasReasoningParts &&
candidate.finishReason &&
!isGemini25Plus;
// Check if this has grounding metadata (should use legacy text + grounding events)
const hasGroundingMetadata = !!candidate.groundingMetadata?.groundingChunks;
// Use content_part/reasoning_part events when:
// 1. There are reasoning parts in current chunk (thought: true)
// 2. There are multiple parts with images (multimodal content)
// 3. There are thoughtSignature in parts (reasoning metadata attached to content)
// 4. There is thoughtsTokenCount in metadata (indicates response contains reasoning)
// 5. This is Gemini 3 model with image generation (always use new format for consistency)
// BUT NOT for:
// - The legacy single-image scenario
// - Grounding metadata scenario (uses legacy text + grounding events)
const shouldUseMultimodalProcessing =
(hasReasoningParts ||
(hasImageParts && parts.length > 1) ||
hasThoughtSignature ||
hasThoughtsInMetadata ||
isGemini3Model) &&
!isSingleImageWithFinish &&
!hasGroundingMetadata;
// Process multimodal parts (text and images in reasoning or content)
if (
shouldUseMultimodalProcessing &&
Array.isArray(candidate.content?.parts) &&
candidate.content.parts.length > 0
) {
const results: StreamProtocolChunk[] = [];
for (const part of candidate.content.parts) {
// 1. Reasoning text part
if (part && part.text && part.thought === true) {
return { data: part.text, id: context.id, type: 'reasoning' };
results.push({
data: {
content: part.text,
inReasoning: true,
partType: 'text',
thoughtSignature: part.thoughtSignature,
} as StreamPartChunkData,
id: context.id,
type: 'reasoning_part',
});
}
// 2. Reasoning image part
else if (part && part.inlineData && part.thought === true) {
results.push({
data: {
content: part.inlineData.data,
inReasoning: true,
mimeType: part.inlineData.mimeType,
partType: 'image',
thoughtSignature: part.thoughtSignature,
} as StreamPartChunkData,
id: context.id,
type: 'reasoning_part',
});
}
// 3. Content text part
else if (part && part.text && !part.thought) {
results.push({
data: {
content: part.text,
partType: 'text',
thoughtSignature: part.thoughtSignature,
} as StreamPartChunkData,
id: context.id,
type: 'content_part',
});
}
// 4. Content image part
else if (part && part.inlineData && !part.thought) {
results.push({
data: {
content: part.inlineData.data,
mimeType: part.inlineData.mimeType,
partType: 'image',
thoughtSignature: part.thoughtSignature,
} as StreamPartChunkData,
id: context.id,
type: 'content_part',
});
}
}
// If we found multimodal parts, return them with usage chunks
if (results.length > 0) {
if (candidate.finishReason && usageMetadata) {
results.push(...usageChunks);
}
return results;
}
}

View File

@@ -77,6 +77,10 @@ export interface StreamProtocolChunk {
| 'reasoning_signature'
// flagged reasoning signature
| 'flagged_reasoning_signature'
// multimodal content part in reasoning
| 'reasoning_part'
// multimodal content part in content
| 'content_part'
// Search or Grounding
| 'grounding'
// stop signal
@@ -91,6 +95,21 @@ export interface StreamProtocolChunk {
| 'data';
}
/**
 * Stream content part chunk data for multimodal (text + image) streaming support.
 *
 * Carried by `reasoning_part` and `content_part` protocol events.
 */
export interface StreamPartChunkData {
  // Text content, or base64-encoded image data when partType is 'image'
  content: string;
  // Whether this part belongs to the reasoning (thinking) phase.
  // Optional: content_part producers omit it entirely (only reasoning_part sets it to true).
  inReasoning?: boolean;
  // Image MIME type (present only for image parts)
  mimeType?: string;
  // Discriminator for the kind of payload carried in `content`
  partType: 'text' | 'image';
  // Optional signature for reasoning verification (Google Gemini feature)
  thoughtSignature?: string;
}
export interface StreamToolCallChunkData {
function?: {
arguments?: string;

View File

@@ -26,14 +26,40 @@ export interface ChatCitationItem {
url: string;
}
/**
 * Message content part types for multimodal content support.
 *
 * A message (or its reasoning) can be stored either as a plain string or as a
 * JSON-serialized array of these parts when it mixes text and images.
 */
export interface MessageContentPartText {
  // Markdown text content
  text: string;
  // Optional Google Gemini thought signature attached to this part
  thoughtSignature?: string;
  type: 'text';
}
export interface MessageContentPartImage {
  // Image URL, or a data URI (base64) while the upload is still in flight
  image: string;
  // Optional Google Gemini thought signature attached to this part
  thoughtSignature?: string;
  type: 'image';
}
export type MessageContentPart = MessageContentPartText | MessageContentPartImage;
export interface ModelReasoning {
  /**
   * Reasoning content, can be plain string or serialized JSON array of MessageContentPart[]
   */
  content?: string;
  // Reasoning duration in milliseconds
  duration?: number;
  /**
   * Flag indicating if content is multimodal (serialized MessageContentPart[])
   */
  isMultimodal?: boolean;
  // Provider-issued reasoning signature (e.g. for verification on resubmission)
  signature?: string;
  // Streaming-only display parts; presumably not persisted (absent from the
  // zod schema below) — confirm against the streaming executor
  tempDisplayContent?: MessageContentPart[];
}
// NOTE: tempDisplayContent is intentionally not part of the persisted schema
export const ModelReasoningSchema = z.object({
  content: z.string().optional(),
  duration: z.number().optional(),
  isMultimodal: z.boolean().optional(),
  signature: z.string().optional(),
});

View File

@@ -78,6 +78,7 @@ export const ModelPerformanceSchema = z.object({
export const MessageMetadataSchema = ModelUsageSchema.merge(ModelPerformanceSchema).extend({
collapsed: z.boolean().optional(),
inspectExpanded: z.boolean().optional(),
isMultimodal: z.boolean().optional(),
});
export interface ModelUsage extends ModelTokensUsage {
@@ -123,4 +124,10 @@ export interface MessageMetadata extends ModelUsage, ModelPerformance {
compare?: boolean;
usage?: ModelUsage;
performance?: ModelPerformance;
/**
* Flag indicating if message content is multimodal (serialized MessageContentPart[])
*/
isMultimodal?: boolean;
// message content is multimodal, display content in the streaming, won't save to db
tempDisplayContent?: string;
}

View File

@@ -5,6 +5,7 @@ export * from './format';
export * from './imageToBase64';
export * from './keyboard';
export * from './merge';
export * from './multimodalContent';
export * from './number';
export * from './object';
export * from './pricing';

View File

@@ -0,0 +1,25 @@
import { MessageContentPart } from '@lobechat/types';
/**
 * Serialize message content parts into a JSON string so a multimodal message
 * can be persisted in a single text column.
 */
export const serializePartsForStorage = (parts: MessageContentPart[]): string =>
  JSON.stringify(parts);
/**
 * Deserialize a stored content string back into message content parts.
 *
 * Returns null when the content is not a JSON-encoded, non-empty array in
 * which EVERY element looks like a content part (has a truthy `type`); callers
 * then treat the content as plain text. Checking every element (not just the
 * first) prevents a partially-malformed array from reaching the renderer.
 */
export function deserializeParts(content: string): MessageContentPart[] | null {
  let parsed: unknown;
  try {
    parsed = JSON.parse(content);
  } catch {
    // Not JSON at all — treat as plain text
    return null;
  }

  if (!Array.isArray(parsed) || parsed.length === 0) return null;

  // Every element must carry a part discriminator; reject mixed/invalid arrays
  const allValid = parsed.every((p: any) => !!p?.type);

  return allValid ? (parsed as MessageContentPart[]) : null;
}

View File

@@ -4,7 +4,7 @@ import { createStyles } from 'antd-style';
import { AnimatePresence, motion } from 'framer-motion';
import { AtomIcon } from 'lucide-react';
import { rgba } from 'polished';
import { CSSProperties, RefObject, memo, useEffect, useRef, useState } from 'react';
import { CSSProperties, ReactNode, RefObject, memo, useEffect, useRef, useState } from 'react';
import { useTranslation } from 'react-i18next';
import { Flexbox } from 'react-layout-kit';
@@ -76,7 +76,7 @@ const useStyles = createStyles(({ css, token }) => ({
interface ThinkingProps {
citations?: ChatCitationItem[];
content?: string;
content?: string | ReactNode;
duration?: number;
style?: CSSProperties;
thinking?: boolean;
@@ -158,7 +158,7 @@ const Thinking = memo<ThinkingProps>((props) => {
</Flexbox>
)}
<Flexbox gap={4} horizontal>
{showDetail && content && (
{showDetail && content && typeof content === 'string' && (
<div
onClick={(event) => {
event.stopPropagation();

View File

@@ -0,0 +1,44 @@
import { deserializeParts } from '@lobechat/utils';
import { Markdown, MarkdownProps } from '@lobehub/ui';
import { memo } from 'react';
import BubblesLoading from '@/components/BubblesLoading';
import { LOADING_FLAT } from '@/const/message';
import { RichContentRenderer } from '@/features/ChatList/components/RichContentRenderer';
import { normalizeThinkTags, processWithArtifact } from '@/features/ChatList/utils/markdown';
/**
 * Renders the main body of an assistant message.
 *
 * Render priority:
 * 1. nothing while a tool call is being generated,
 * 2. a loading bubble when there is no content (and no images) or the
 *    content is the LOADING_FLAT placeholder,
 * 3. multimodal parts via RichContentRenderer when the message is flagged
 *    multimodal and its content deserializes into parts,
 * 4. otherwise plain Markdown.
 */
const MessageContent = memo<{
  addIdOnDOM?: boolean;
  // Persisted content; may be a serialized MessageContentPart[] when isMultimodal
  content: string;
  hasImages?: boolean;
  isMultimodal?: boolean;
  isToolCallGenerating?: boolean;
  markdownProps?: Omit<MarkdownProps, 'className' | 'style' | 'children'>;
  // Streaming-only serialized parts, preferred over `content` while generating
  tempDisplayContent?: string;
}>(
  ({
    markdownProps,
    content,
    isToolCallGenerating,
    hasImages,
    isMultimodal,
    tempDisplayContent,
  }) => {
    // Pre-process for the Markdown fallback path (think tags / artifacts)
    const message = normalizeThinkTags(processWithArtifact(content));
    if (isToolCallGenerating) return;
    if ((!content && !hasImages) || content === LOADING_FLAT) return <BubblesLoading />;
    // deserializeParts returns null for plain text, falling through to Markdown
    const contentParts = isMultimodal ? deserializeParts(tempDisplayContent || content) : null;
    return contentParts ? (
      <RichContentRenderer parts={contentParts} />
    ) : (
      <Markdown {...markdownProps} variant={'chat'}>
        {message}
      </Markdown>
    );
  },
);

export default MessageContent;

View File

@@ -0,0 +1,96 @@
import { LOADING_FLAT } from '@lobechat/const';
import { UIChatMessage } from '@lobechat/types';
import { MarkdownProps } from '@lobehub/ui';
import { ReactNode, memo } from 'react';
import { Flexbox } from 'react-layout-kit';
import { useChatStore } from '@/store/chat';
import { aiChatSelectors, messageStateSelectors } from '@/store/chat/selectors';
import { DefaultMessage } from '../Default';
import ImageFileListViewer from '../User/ImageFileListViewer';
import { CollapsedMessage } from './CollapsedMessage';
import MessageContent from './DisplayContent';
import FileChunks from './FileChunks';
import IntentUnderstanding from './IntentUnderstanding';
import Reasoning from './Reasoning';
import SearchGrounding from './SearchGrounding';
/**
 * Full assistant message body: search grounding, file chunks, reasoning,
 * intent-understanding placeholder, the message content itself, and any
 * generated images, in that visual order.
 *
 * Editing and collapsed states short-circuit to dedicated renderings.
 */
export const AssistantMessageBody = memo<
  UIChatMessage & {
    editableContent: ReactNode;
    markdownProps?: Omit<MarkdownProps, 'className' | 'style' | 'children'>;
  }
>(
  ({
    id,
    tools,
    content,
    chunksList,
    search,
    imageList,
    metadata,
    editableContent,
    markdownProps,
    ...props
  }) => {
    const [editing, generating, isCollapsed] = useChatStore((s) => [
      messageStateSelectors.isMessageEditing(id)(s),
      messageStateSelectors.isMessageGenerating(id)(s),
      messageStateSelectors.isMessageCollapsed(id)(s),
    ]);
    // A tool call is "generating" while content is still empty/placeholder but tools exist
    const isToolCallGenerating = generating && (content === LOADING_FLAT || !content) && !!tools;
    const isReasoning = useChatStore(aiChatSelectors.isMessageInReasoning(id));
    const isIntentUnderstanding = useChatStore(aiChatSelectors.isIntentUnderstanding(id));
    const showSearch = !!search && !!search.citations?.length;
    const showImageItems = !!imageList && imageList.length > 0;
    // remove \n to avoid empty content
    // refs: https://github.com/lobehub/lobe-chat/pull/6153
    const showReasoning =
      (!!props.reasoning && props.reasoning.content?.trim() !== '') ||
      (!props.reasoning && isReasoning);
    const showFileChunks = !!chunksList && chunksList.length > 0;
    if (editing)
      return (
        <DefaultMessage
          content={content}
          editableContent={editableContent}
          id={id}
          isToolCallGenerating={isToolCallGenerating}
          {...props}
        />
      );
    if (isCollapsed) return <CollapsedMessage content={content} id={id} />;
    return (
      <Flexbox gap={8} id={id}>
        {showSearch && (
          <SearchGrounding citations={search?.citations} searchQueries={search?.searchQueries} />
        )}
        {showFileChunks && <FileChunks data={chunksList} />}
        {showReasoning && <Reasoning {...props.reasoning} id={id} />}
        {isIntentUnderstanding ? (
          <IntentUnderstanding />
        ) : (
          <MessageContent
            content={content}
            hasImages={showImageItems}
            isMultimodal={metadata?.isMultimodal}
            isToolCallGenerating={isToolCallGenerating}
            markdownProps={markdownProps}
            tempDisplayContent={metadata?.tempDisplayContent}
          />
        )}
        {showImageItems && <ImageFileListViewer items={imageList} />}
      </Flexbox>
    );
  },
);

View File

@@ -1,78 +0,0 @@
import { LOADING_FLAT } from '@lobechat/const';
import { UIChatMessage } from '@lobechat/types';
import { ReactNode, memo } from 'react';
import { Flexbox } from 'react-layout-kit';
import { useChatStore } from '@/store/chat';
import { aiChatSelectors, messageStateSelectors } from '@/store/chat/selectors';
import { DefaultMessage } from '../Default';
import ImageFileListViewer from '../User/ImageFileListViewer';
import { CollapsedMessage } from './CollapsedMessage';
import FileChunks from './FileChunks';
import IntentUnderstanding from './IntentUnderstanding';
import Reasoning from './Reasoning';
import SearchGrounding from './SearchGrounding';
export const AssistantMessageContent = memo<
UIChatMessage & {
editableContent: ReactNode;
}
>(({ id, tools, content, chunksList, search, imageList, ...props }) => {
const [editing, generating, isCollapsed] = useChatStore((s) => [
messageStateSelectors.isMessageEditing(id)(s),
messageStateSelectors.isMessageGenerating(id)(s),
messageStateSelectors.isMessageCollapsed(id)(s),
]);
const isToolCallGenerating = generating && (content === LOADING_FLAT || !content) && !!tools;
const isReasoning = useChatStore(aiChatSelectors.isMessageInReasoning(id));
const isIntentUnderstanding = useChatStore(aiChatSelectors.isIntentUnderstanding(id));
const showSearch = !!search && !!search.citations?.length;
const showImageItems = !!imageList && imageList.length > 0;
// remove \n to avoid empty content
// refs: https://github.com/lobehub/lobe-chat/pull/6153
const showReasoning =
(!!props.reasoning && props.reasoning.content?.trim() !== '') ||
(!props.reasoning && isReasoning);
const showFileChunks = !!chunksList && chunksList.length > 0;
if (editing)
return (
<DefaultMessage
content={content}
id={id}
isToolCallGenerating={isToolCallGenerating}
{...props}
/>
);
if (isCollapsed) return <CollapsedMessage content={content} id={id} />;
return (
<Flexbox gap={8} id={id}>
{showSearch && (
<SearchGrounding citations={search?.citations} searchQueries={search?.searchQueries} />
)}
{showFileChunks && <FileChunks data={chunksList} />}
{showReasoning && <Reasoning {...props.reasoning} id={id} />}
{isIntentUnderstanding ? (
<IntentUnderstanding />
) : (
<DefaultMessage
addIdOnDOM={false}
content={content}
id={id}
isToolCallGenerating={isToolCallGenerating}
{...props}
/>
)}
{showImageItems && <ImageFileListViewer items={imageList} />}
</Flexbox>
);
});

View File

@@ -1,3 +1,5 @@
import { MessageContentPart } from '@lobechat/types';
import { deserializeParts } from '@lobechat/utils';
import { memo } from 'react';
import Thinking from '@/components/Thinking';
@@ -6,24 +8,35 @@ import { aiChatSelectors } from '@/store/chat/selectors';
import { useUserStore } from '@/store/user';
import { userGeneralSettingsSelectors } from '@/store/user/selectors';
import { RichContentRenderer } from '../../../components/RichContentRenderer';
interface ReasoningProps {
  // Reasoning text, or serialized MessageContentPart[] JSON when multimodal
  content?: string;
  // Reasoning duration in milliseconds
  duration?: number;
  // Message id, used to look up in-flight reasoning state
  id: string;
  // Whether `content` holds serialized multimodal parts
  isMultimodal?: boolean;
  // Streaming-only parts for live display; takes precedence over `content`
  tempDisplayContent?: MessageContentPart[];
}
const Reasoning = memo<ReasoningProps>(({ content = '', duration, id }) => {
const isReasoning = useChatStore(aiChatSelectors.isMessageInReasoning(id));
const transitionMode = useUserStore(userGeneralSettingsSelectors.transitionMode);
const Reasoning = memo<ReasoningProps>(
({ content = '', duration, id, isMultimodal, tempDisplayContent }) => {
const isReasoning = useChatStore(aiChatSelectors.isMessageInReasoning(id));
const transitionMode = useUserStore(userGeneralSettingsSelectors.transitionMode);
return (
<Thinking
content={content}
duration={duration}
thinking={isReasoning}
thinkingAnimated={transitionMode === 'fadeIn' && isReasoning}
/>
);
});
const parts = tempDisplayContent || deserializeParts(content);
// If parts are provided, render multimodal content
const thinkingContent = isMultimodal && parts ? <RichContentRenderer parts={parts} /> : content;
return (
<Thinking
content={thinkingContent}
duration={duration}
thinking={isReasoning}
thinkingAnimated={transitionMode === 'fadeIn' && isReasoning}
/>
);
},
);
export default Reasoning;

View File

@@ -33,7 +33,7 @@ import { useDoubleClickEdit } from '../../hooks/useDoubleClickEdit';
import { normalizeThinkTags, processWithArtifact } from '../../utils/markdown';
import { AssistantActionsBar } from './Actions';
import { AssistantMessageExtra } from './Extra';
import { AssistantMessageContent } from './MessageContent';
import { AssistantMessageBody } from './MessageBody';
const rehypePlugins = markdownElements.map((element) => element.rehypePlugin).filter(Boolean);
const remarkPlugins = markdownElements.map((element) => element.remarkPlugin).filter(Boolean);
@@ -75,7 +75,7 @@ export const useStyles = createStyles(
justify-content: ${placement === 'left' ? 'flex-end' : 'flex-start'};
`,
editing &&
css`
css`
pointer-events: none !important;
opacity: 0 !important;
`,
@@ -88,7 +88,6 @@ export const useStyles = createStyles(
width: 100%;
max-width: 100vw;
padding-block: 24px 12px;
padding-inline: 12px;
@supports (content-visibility: auto) {
contain-intrinsic-size: auto 100lvh;
@@ -305,9 +304,13 @@ const AssistantMessage = memo<AssistantMessageProps>(
const renderMessage = useCallback(
(editableContent: ReactNode) => (
<AssistantMessageContent {...item} editableContent={editableContent} />
<AssistantMessageBody
{...item}
editableContent={editableContent}
markdownProps={markdownProps}
/>
),
[item],
[item, markdownProps],
);
const errorMessage = <ErrorMessageExtra data={item} />;

View File

@@ -6,25 +6,22 @@ import { LOADING_FLAT } from '@/const/message';
import { useChatStore } from '@/store/chat';
import { messageStateSelectors } from '@/store/chat/selectors';
export const MessageContentClassName = 'msg_content_flag'
export const MessageContentClassName = 'msg_content_flag';
export const DefaultMessage = memo<
UIChatMessage & {
addIdOnDOM?: boolean;
editableContent: ReactNode;
hasImages?: boolean;
isToolCallGenerating?: boolean;
}
>(({ id, editableContent, content, isToolCallGenerating, addIdOnDOM = true }) => {
>(({ id, editableContent, content, isToolCallGenerating, addIdOnDOM = true, hasImages }) => {
const editing = useChatStore(messageStateSelectors.isMessageEditing(id));
if (isToolCallGenerating) return;
if (!content) return <BubblesLoading />;
if (!content && !hasImages) return <BubblesLoading />;
if (content === LOADING_FLAT && !editing) return <BubblesLoading />;
return <div id={addIdOnDOM ? id : undefined}>{editableContent}</div>;
});
export const DefaultBelowMessage = memo<UIChatMessage>(() => {
return null;
});

View File

@@ -0,0 +1,35 @@
import { Image, Markdown } from '@lobehub/ui';
import { memo } from 'react';
import { Flexbox } from 'react-layout-kit';
import { MessageContentPart } from '@/types/index';
interface RichContentRendererProps {
  // Ordered multimodal parts to render top-to-bottom
  parts: MessageContentPart[];
}

/**
 * Renders a multimodal message as a vertical stack: text parts as chat-variant
 * Markdown, image parts as constrained images. Unknown part types are skipped.
 */
export const RichContentRenderer = memo<RichContentRendererProps>(({ parts }) => {
  return (
    <Flexbox gap={8}>
      {parts.map((part, index) => {
        if (part.type === 'text') {
          return (
            <Markdown key={index} variant="chat">
              {part.text}
            </Markdown>
          );
        }
        if (part.type === 'image') {
          return (
            <Image key={index} src={part.image} style={{ borderRadius: 8, maxWidth: '100%' }} />
          );
        }
        // Forward-compat: silently ignore part types this renderer doesn't know
        return null;
      })}
    </Flexbox>
  );
});

RichContentRenderer.displayName = 'RichContentRenderer';

View File

@@ -5,14 +5,17 @@ import { isDesktop } from '@lobechat/const';
import {
ChatImageItem,
ChatToolPayload,
MessageContentPart,
MessageToolCall,
ModelUsage,
TraceNameMap,
UIChatMessage,
} from '@lobechat/types';
import { serializePartsForStorage } from '@lobechat/utils';
import debug from 'debug';
import { t } from 'i18next';
import { throttle } from 'lodash-es';
import pMap from 'p-map';
import { StateCreator } from 'zustand/vanilla';
import { createAgentToolsEngine } from '@/helpers/toolEngineering';
@@ -272,14 +275,21 @@ export const streamingExecutor: StateCreator<
let finalUsage;
let msgTraceId: string | undefined;
let output = '';
let thinking = '';
let thinkingContent = '';
let thinkingStartAt: number;
let duration: number | undefined;
let thinkingDuration: number | undefined;
let reasoningOperationId: string | undefined;
let finishType: string | undefined;
// to upload image
const uploadTasks: Map<string, Promise<{ id?: string; url?: string }>> = new Map();
// Multimodal content parts
let contentParts: MessageContentPart[] = [];
let reasoningParts: MessageContentPart[] = [];
const contentImageUploads: Map<number, Promise<string>> = new Map();
const reasoningImageUploads: Map<number, Promise<string>> = new Map();
// Throttle tool_calls updates to prevent excessive re-renders (max once per 300ms)
const throttledUpdateToolCalls = throttle(
(toolCalls: MessageToolCall[]) => {
@@ -344,7 +354,9 @@ export const streamingExecutor: StateCreator<
if (uploadTasks.size > 0) {
try {
// 等待所有上传任务完成
const uploadResults = await Promise.all(uploadTasks.values());
const uploadResults = await pMap(Array.from(uploadTasks.values()), (task) => task, {
concurrency: 5,
});
// 使用上传后的 S3 URL 替换原始图像数据
finalImages = uploadResults.filter((i) => !!i.url) as ChatImageItem[];
@@ -353,6 +365,14 @@ export const streamingExecutor: StateCreator<
}
}
// Wait for all multimodal image uploads to complete
// Note: Arrays are already updated in-place when uploads complete
// Use Promise.allSettled to continue even if some uploads fail
await Promise.allSettled([
...Array.from(contentImageUploads.values()),
...Array.from(reasoningImageUploads.values()),
]);
let parsedToolCalls = toolCalls;
if (parsedToolCalls && parsedToolCalls.length > 0) {
// Flush any pending throttled updates before finalizing
@@ -384,18 +404,58 @@ export const streamingExecutor: StateCreator<
operationId,
);
// Check if there are any image parts
const hasContentImages = contentParts.some((part) => part.type === 'image');
const hasReasoningImages = reasoningParts.some((part) => part.type === 'image');
// Determine final content
// If has images, serialize contentParts; otherwise use accumulated output text
const finalContent = hasContentImages ? serializePartsForStorage(contentParts) : output;
const finalDuration =
thinkingDuration && !isNaN(thinkingDuration) ? thinkingDuration : undefined;
// Determine final reasoning content
// Priority: reasoningParts (multimodal) > thinkingContent (from reasoning_part text) > reasoning (from old reasoning event)
let finalReasoning: any = undefined;
if (hasReasoningImages) {
// Has images, use multimodal format
finalReasoning = {
content: serializePartsForStorage(reasoningParts),
duration: finalDuration,
isMultimodal: true,
};
} else if (thinkingContent) {
// Has text from reasoning_part but no images
finalReasoning = {
content: thinkingContent,
duration: finalDuration,
};
} else if (reasoning?.content) {
// Fallback to old reasoning event content
finalReasoning = {
...reasoning,
duration: finalDuration,
};
}
// update the content after fetch result
await optimisticUpdateMessageContent(
messageId,
content,
finalContent,
{
tools,
reasoning: !!reasoning
? { ...reasoning, duration: duration && !isNaN(duration) ? duration : undefined }
: undefined,
reasoning: finalReasoning,
search: !!grounding?.citations ? grounding : undefined,
imageList: finalImages.length > 0 ? finalImages : undefined,
metadata: { ...usage, ...speed, performance: speed, usage, finishType: type },
metadata: {
...usage,
...speed,
performance: speed,
usage,
finishType: type,
...(hasContentImages && { isMultimodal: true }),
},
},
{ operationId },
);
@@ -457,8 +517,8 @@ export const streamingExecutor: StateCreator<
output += chunk.text;
// if there is no duration, it means the end of reasoning
if (!duration) {
duration = Date.now() - thinkingStartAt;
if (!thinkingDuration) {
thinkingDuration = Date.now() - thinkingStartAt;
// Complete reasoning operation if it exists
if (reasoningOperationId) {
@@ -480,7 +540,9 @@ export const streamingExecutor: StateCreator<
type: 'updateMessage',
value: {
content: output,
reasoning: !!thinking ? { content: thinking, duration } : undefined,
reasoning: !!thinkingContent
? { content: thinkingContent, duration: thinkingDuration }
: undefined,
},
},
{ operationId },
@@ -505,13 +567,178 @@ export const streamingExecutor: StateCreator<
get().associateMessageWithOperation(messageId, reasoningOperationId);
}
thinking += chunk.text;
thinkingContent += chunk.text;
internal_dispatchMessage(
{
id: messageId,
type: 'updateMessage',
value: { reasoning: { content: thinking } },
value: { reasoning: { content: thinkingContent } },
},
{ operationId },
);
break;
}
case 'reasoning_part': {
// Start reasoning if not started
if (!thinkingStartAt) {
thinkingStartAt = Date.now();
const { operationId: reasoningOpId } = get().startOperation({
type: 'reasoning',
context: { sessionId, topicId, messageId },
parentOperationId: operationId,
});
reasoningOperationId = reasoningOpId;
get().associateMessageWithOperation(messageId, reasoningOperationId);
}
const { partType, content: partContent, mimeType } = chunk;
if (partType === 'text') {
const lastPart = reasoningParts.at(-1);
// If last part is also text, merge chunks together
if (lastPart?.type === 'text') {
reasoningParts = [
...reasoningParts.slice(0, -1),
{ type: 'text', text: lastPart.text + partContent },
];
} else {
// Create new text part (first chunk, may contain thoughtSignature)
reasoningParts = [...reasoningParts, { type: 'text', text: partContent }];
}
thinkingContent += partContent;
} else if (partType === 'image') {
// Image part - create new array to avoid mutation
const tempImage = `data:${mimeType};base64,${partContent}`;
const partIndex = reasoningParts.length;
const newPart: MessageContentPart = { type: 'image', image: tempImage };
reasoningParts = [...reasoningParts, newPart];
// Start upload task and update array when done
const uploadTask = getFileStoreState()
.uploadBase64FileWithProgress(tempImage)
.then((file) => {
const url = file?.url || tempImage;
// Replace the part at index by creating a new array
const updatedParts = [...reasoningParts];
updatedParts[partIndex] = { type: 'image', image: url };
reasoningParts = updatedParts;
return url;
})
.catch((error) => {
console.error('[reasoning_part] Image upload failed:', error);
return tempImage;
});
reasoningImageUploads.set(partIndex, uploadTask);
}
// Real-time update with display format
// Check if there are any image parts to determine if it's multimodal
const hasReasoningImages = reasoningParts.some((part) => part.type === 'image');
internal_dispatchMessage(
{
id: messageId,
type: 'updateMessage',
value: {
reasoning: hasReasoningImages
? { tempDisplayContent: reasoningParts, isMultimodal: true }
: { content: thinkingContent },
},
},
{ operationId },
);
break;
}
case 'content_part': {
const { partType, content: partContent, mimeType } = chunk;
// End reasoning when content starts
if (!thinkingDuration && reasoningOperationId) {
thinkingDuration = Date.now() - thinkingStartAt;
get().completeOperation(reasoningOperationId);
reasoningOperationId = undefined;
}
if (partType === 'text') {
const lastPart = contentParts.at(-1);
// If last part is also text, merge chunks together
if (lastPart?.type === 'text') {
contentParts = [
...contentParts.slice(0, -1),
{ type: 'text', text: lastPart.text + partContent },
];
} else {
// Create new text part (first chunk, may contain thoughtSignature)
contentParts = [...contentParts, { type: 'text', text: partContent }];
}
output += partContent;
} else if (partType === 'image') {
// Image part - create new array to avoid mutation
const tempImage = `data:${mimeType};base64,${partContent}`;
const partIndex = contentParts.length;
const newPart: MessageContentPart = {
type: 'image',
image: tempImage,
};
contentParts = [...contentParts, newPart];
// Start upload task and update array when done
const uploadTask = getFileStoreState()
.uploadBase64FileWithProgress(tempImage)
.then((file) => {
const url = file?.url || tempImage;
// Replace the part at index by creating a new array
const updatedParts = [...contentParts];
updatedParts[partIndex] = {
type: 'image',
image: url,
};
contentParts = updatedParts;
return url;
})
.catch((error) => {
console.error('[content_part] Image upload failed:', error);
return tempImage;
});
contentImageUploads.set(partIndex, uploadTask);
}
// Real-time update with display format
// Check if there are any image parts to determine if it's multimodal
const hasContentImages = contentParts.some((part) => part.type === 'image');
const hasReasoningImages = reasoningParts.some((part) => part.type === 'image');
internal_dispatchMessage(
{
id: messageId,
type: 'updateMessage',
value: {
content: output,
reasoning: hasReasoningImages
? {
tempDisplayContent: reasoningParts,
isMultimodal: true,
duration: thinkingDuration,
}
: !!thinkingContent
? { content: thinkingContent, duration: thinkingDuration }
: undefined,
...(hasContentImages && {
metadata: {
isMultimodal: true,
tempDisplayContent: serializePartsForStorage(contentParts),
},
}),
},
},
{ operationId },
);
@@ -525,8 +752,8 @@ export const streamingExecutor: StateCreator<
isFunctionCall = true;
// Complete reasoning operation if it exists
if (!duration && reasoningOperationId) {
duration = Date.now() - thinkingStartAt;
if (!thinkingDuration && reasoningOperationId) {
thinkingDuration = Date.now() - thinkingStartAt;
get().completeOperation(reasoningOperationId);
reasoningOperationId = undefined;
}
@@ -535,8 +762,8 @@ export const streamingExecutor: StateCreator<
case 'stop': {
// Complete reasoning operation when receiving stop signal
if (!duration && reasoningOperationId) {
duration = Date.now() - thinkingStartAt;
if (!thinkingDuration && reasoningOperationId) {
thinkingDuration = Date.now() - thinkingStartAt;
get().completeOperation(reasoningOperationId);
reasoningOperationId = undefined;
}