🐛 fix(stream): update event handling to use 'text' instead of 'content_part' for Gemini 2.5 models (#11235)

🐛 fix(stream): update event handling to use 'text' instead of 'content_part' in the Google AI stream
2026-03-27 13:29:15 +07:00 · 2026-01-31 22:43:18 +08:00
parent 338df4baf9
commit a76a630f28
2 changed files with 55 additions and 17 deletions
--- a/packages/model-runtime/src/core/streams/google/google-ai.test.ts
+++ b/packages/model-runtime/src/core/streams/google/google-ai.test.ts
@@ -251,16 +251,16 @@ describe('GoogleGenerativeAIStream', () => {
      expect(chunks).toEqual(
        [
          'id: chat_1',
-          'event: content_part',
-          'data: {"content":"234","partType":"text"}\n',
+          'event: text',
+          'data: "234"\n',

          'id: chat_1',
          'event: text',
          'data: ""\n',

          'id: chat_1',
-          'event: content_part',
-          `data: {"content":"567890\\n","partType":"text"}\n`,
+          'event: text',
+          `data: "567890\\n"\n`,
          // stop
          'id: chat_1',
          'event: stop',
@@ -384,12 +384,12 @@ describe('GoogleGenerativeAIStream', () => {
          `data: {"content":"**Finalizing Interpretation**\\n\\n","inReasoning":true,"partType":"text"}\n`,

          'id: chat_1',
-          'event: content_part',
-          `data: {"content":"简单来说，","partType":"text"}\n`,
+          'event: text',
+          'data: "简单来说，"\n',

          'id: chat_1',
-          'event: content_part',
-          `data: {"content":"文本内容。","partType":"text"}\n`,
+          'event: text',
+          'data: "文本内容。"\n',
          // stop
          'id: chat_1',
          'event: stop',
@@ -471,12 +471,12 @@ describe('GoogleGenerativeAIStream', () => {
      expect(chunks).toEqual(
        [
          'id: chat_1',
-          'event: content_part',
-          'data: {"content":"234","partType":"text"}\n',
+          'event: text',
+          'data: "234"\n',

          'id: chat_1',
-          'event: content_part',
-          `data: {"content":"567890\\n","partType":"text"}\n`,
+          'event: text',
+          'data: "567890\\n"\n',
          // stop
          'id: chat_1',
          'event: stop',
@@ -1103,7 +1103,7 @@ describe('GoogleGenerativeAIStream', () => {
              content: {
                parts: [
                  {
-                    text: '**Planning the Solution**\n\nI\'m solidifying my plan...',
+                    text: "**Planning the Solution**\n\nI'm solidifying my plan...",
                    thought: true,
                  },
                ],
@@ -1901,5 +1901,46 @@ describe('GoogleGenerativeAIStream', () => {
        ].map((i) => i + '\n'),
      );
    });
+
+    it('should NOT use multimodal processing if only thoughtsTokenCount is present in metadata but no thought parts', async () => {
+      vi.spyOn(uuidModule, 'nanoid').mockReturnValueOnce('1');
+
+      const data = [
+        {
+          candidates: [
+            {
+              content: {
+                parts: [{ text: 'Hello world' }],
+                role: 'model',
+              },
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 10,
+            candidatesTokenCount: 2,
+            totalTokenCount: 17,
+            thoughtsTokenCount: 5,
+          },
+          modelVersion: 'gemini-2.5-flash',
+        },
+      ];
+
+      const mockGoogleStream = new ReadableStream({
+        start(controller) {
+          data.forEach((item) => {
+            controller.enqueue(item);
+          });
+          controller.close();
+        },
+      });
+
+      const protocolStream = GoogleGenerativeAIStream(mockGoogleStream);
+      const chunks = await decodeStreamChunks(protocolStream);
+
+      // Should use 'text' event, not 'content_part'
+      expect(chunks).toContain('event: text\n');
+      expect(chunks).not.toContain('event: content_part\n');
+    });
  });
 });
--- a/packages/model-runtime/src/core/streams/google/index.ts
+++ b/packages/model-runtime/src/core/streams/google/index.ts
@@ -120,7 +120,6 @@ const transformGoogleGenerativeAIStream = (
    const hasReasoningParts = parts.some((p: any) => p.thought === true);
    const hasImageParts = parts.some((p: any) => p.inlineData);
    const hasThoughtSignature = parts.some((p: any) => p.thoughtSignature);
-    const hasThoughtsInMetadata = (usageMetadata as any)?.thoughtsTokenCount > 0;

    // Check model version to determine if new format should be used
    const modelVersion = (chunk as any).modelVersion || '';
@@ -144,8 +143,7 @@ const transformGoogleGenerativeAIStream = (
    // 1. There are reasoning parts in current chunk (thought: true)
    // 2. There are multiple parts with images (multimodal content)
    // 3. There are thoughtSignature in parts (reasoning metadata attached to content)
-    // 4. There is thoughtsTokenCount in metadata (indicates response contains reasoning)
-    // 5. This is Gemini 3 model with image generation (always use new format for consistency)
+    // 4. This is Gemini 3 model with image generation (always use new format for consistency)
    // BUT NOT for:
    // - The legacy single-image scenario
    // - Grounding metadata scenario (uses legacy text + grounding events)
@@ -153,7 +151,6 @@ const transformGoogleGenerativeAIStream = (
      (hasReasoningParts ||
        (hasImageParts && parts.length > 1) ||
        hasThoughtSignature ||
-        hasThoughtsInMetadata ||
        isGemini3Model) &&
      !isSingleImageWithFinish &&
      !hasGroundingMetadata;