From 61ecf1bc1ba278087fb83e61f784b5065fff64b7 Mon Sep 17 00:00:00 2001
From: myhloli <moe@myhloli.com>
Date: Fri, 20 Mar 2026 18:26:34 +0800
Subject: [PATCH] feat: refactor algorithm block handling to use code block
 type and enhance markdown rendering

---
 .../backend/pipeline/pipeline_magic_model.py  | 35 ++++++---
 .../pipeline_middle_json_mkcontent.py         | 71 +++++++++++++++----
 mineru/utils/enum_class.py                    |  3 +-
 3 files changed, 87 insertions(+), 22 deletions(-)

diff --git a/mineru/backend/pipeline/pipeline_magic_model.py b/mineru/backend/pipeline/pipeline_magic_model.py
index e8cc40fc..fde2ce85 100644
--- a/mineru/backend/pipeline/pipeline_magic_model.py
+++ b/mineru/backend/pipeline/pipeline_magic_model.py
@@ -1,4 +1,5 @@
 from mineru.backend.pipeline.para_split import ListLineTag
+from mineru.backend.pipeline.pipeline_middle_json_mkcontent import _merge_para_text
 from mineru.utils.boxbase import (
     bbox_center_distance,
     bbox_distance,
@@ -6,6 +7,7 @@ from mineru.utils.boxbase import (
     calculate_overlap_area_in_bbox1_area_ratio,
 )
 from mineru.utils.enum_class import ContentType, BlockType
+from mineru.utils.guess_suffix_or_lang import guess_language_by_text
 from mineru.utils.span_block_fix import merge_spans_to_vertical_line, vertical_line_sort_spans_from_top_to_bottom, \
     merge_spans_to_line, line_sort_spans_by_left_to_right
 from mineru.utils.span_pre_proc import txt_spans_extract
@@ -15,7 +17,7 @@ class MagicModel:
 
     PP_DOCLAYOUT_V2_LABELS_TO_BLOCK_TYPES = {
         "abstract": BlockType.ABSTRACT,
-        "algorithm": BlockType.ALGORITHM,
+        "algorithm": BlockType.CODE,
         "aside_text": BlockType.ASIDE_TEXT,
         "chart": BlockType.CHART,
         "content": BlockType.INDEX,
@@ -39,7 +41,7 @@ class MagicModel:
         "vision_footnote": BlockType.FOOTNOTE,
     }
 
-    VISUAL_MAIN_TYPES = (BlockType.IMAGE, BlockType.TABLE, BlockType.CHART, BlockType.ALGORITHM)
+    VISUAL_MAIN_TYPES = (BlockType.IMAGE, BlockType.TABLE, BlockType.CHART, BlockType.CODE)
     VISUAL_CHILD_TYPES = (BlockType.CAPTION, BlockType.FOOTNOTE)
     VISUAL_TYPE_MAPPING = {
         BlockType.IMAGE: {
@@ -57,10 +59,10 @@ class MagicModel:
             "caption": BlockType.CHART_CAPTION,
             "footnote": BlockType.CHART_FOOTNOTE,
         },
-        BlockType.ALGORITHM: {
-            "body": BlockType.ALGORITHM_BODY,
-            "caption": BlockType.ALGORITHM_CAPTION,
-            "footnote": BlockType.ALGORITHM_FOOTNOTE,
+        BlockType.CODE: {
+            "body": BlockType.CODE_BODY,
+            "caption": BlockType.CODE_CAPTION,
+            "footnote": BlockType.CODE_FOOTNOTE,
         }
     }
 
@@ -129,9 +131,18 @@ class MagicModel:
             block_lines = merge_spans_to_line(block['spans'])
             sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
 
-        if block["type"] == BlockType.ALGORITHM:
+        if block["type"] == BlockType.CODE:
             for line in sort_block_lines:
                 line[ListLineTag.IS_LIST_START_LINE] = True
+            code_content = _merge_para_text(
+                {'lines': sort_block_lines},
+                False,
+                '\n'
+            )
+            guess_lang = guess_language_by_text(code_content)
+            if guess_lang not in ["txt", "unknown"]:
+                block["sub_type"] = "code"
+                block["guess_lang"] = guess_lang
 
         block['lines'] = sort_block_lines
         del block['spans']
@@ -172,6 +183,14 @@ class MagicModel:
             ]:
                 self.discarded_blocks.append(block)
             else:
+                # 单独处理code block
+                if block["type"] in [BlockType.CODE]:
+                    for sub_block in block["blocks"]:
+                        if sub_block["type"] == BlockType.CODE_BODY:
+                            block["sub_type"] = sub_block.pop("sub_type", "algorithm")
+                            if block["sub_type"] == "code":
+                                block["guess_lang"] = sub_block.pop("guess_lang", "txt")
+
                 self.preproc_blocks.append(block)
 
     def __build_page_blocks(self):
@@ -179,7 +198,7 @@ class MagicModel:
         for block in self.page_blocks:
             if block["type"] in [
                 BlockType.ABSTRACT,
-                BlockType.ALGORITHM,
+                BlockType.CODE,
                 BlockType.ASIDE_TEXT,
                 BlockType.INDEX,
                 BlockType.DOC_TITLE,
diff --git a/mineru/backend/pipeline/pipeline_middle_json_mkcontent.py b/mineru/backend/pipeline/pipeline_middle_json_mkcontent.py
index dbd69de5..49a3b136 100644
--- a/mineru/backend/pipeline/pipeline_middle_json_mkcontent.py
+++ b/mineru/backend/pipeline/pipeline_middle_json_mkcontent.py
@@ -48,7 +48,7 @@ def make_blocks_to_markdown(paras_of_layout,
                 continue
             elif mode == MakeMode.MM_MD:
                 para_text = merge_visual_blocks_to_markdown(para_block, img_buket_path)
-        elif para_type == BlockType.ALGORITHM:
+        elif para_type == BlockType.CODE:
             para_text = merge_visual_blocks_to_markdown(para_block)
 
         if para_text.strip() == '':
@@ -60,13 +60,14 @@ def make_blocks_to_markdown(paras_of_layout,
 
 
 def merge_visual_blocks_to_markdown(para_block, img_buket_path=''):
-    # 将 image/table/algorithm 这类视觉块的子 block 按阅读顺序拼接成 markdown。
+    # 将 image/table/chart/code 这类视觉块的子 block 按阅读顺序拼接成 markdown。
     # 这里不再写死 caption/body/footnote 的优先级，而是先展开成 segment，
     # 再根据 markdown_line / html_block 两类片段决定分隔方式。
     rendered_segments = []
 
     for block in get_blocks_in_index_order(para_block.get('blocks', [])):
-        rendered_segments.extend(render_visual_block_segments(block, img_buket_path))
+        render_block = _inherit_parent_code_render_metadata(block, para_block)
+        rendered_segments.extend(render_visual_block_segments(render_block, img_buket_path))
 
     para_text = ''
     prev_segment_kind = None
@@ -91,6 +92,27 @@ def get_blocks_in_index_order(blocks):
     ]
 
 
+def _inherit_parent_code_render_metadata(block, parent_block):
+    # pipeline_magic_model 会把 code_body 的 sub_type/guess_lang 提升到父 code block。
+    # markdown 渲染 code_body 时需要把这两个字段临时透传回来，但不能修改原始输入。
+    if block.get('type') != BlockType.CODE_BODY:
+        return block
+    if parent_block.get('type') != BlockType.CODE:
+        return block
+
+    needs_sub_type = 'sub_type' not in block and 'sub_type' in parent_block
+    needs_guess_lang = 'guess_lang' not in block and 'guess_lang' in parent_block
+    if not needs_sub_type and not needs_guess_lang:
+        return block
+
+    render_block = dict(block)
+    if needs_sub_type:
+        render_block['sub_type'] = parent_block['sub_type']
+    if needs_guess_lang:
+        render_block['guess_lang'] = parent_block['guess_lang']
+    return render_block
+
+
 def render_visual_block_segments(block, img_buket_path=''):
     # 将单个视觉子 block 渲染成一个或多个 segment。
     # 文本类子块统一输出 markdown_line；
@@ -102,9 +124,9 @@ def render_visual_block_segments(block, img_buket_path=''):
         BlockType.IMAGE_FOOTNOTE,
         BlockType.TABLE_CAPTION,
         BlockType.TABLE_FOOTNOTE,
-        BlockType.ALGORITHM_BODY,
-        BlockType.ALGORITHM_CAPTION,
-        BlockType.ALGORITHM_FOOTNOTE,
+        BlockType.CODE_BODY,
+        BlockType.CODE_CAPTION,
+        BlockType.CODE_FOOTNOTE,
         BlockType.CHART_CAPTION,
         BlockType.CHART_FOOTNOTE,
     ]:
@@ -176,6 +198,22 @@ CJK_LANGS = {'zh', 'ja', 'ko'}
 
 
 def merge_para_with_text(para_block):
+    if _is_fenced_code_block(para_block):
+        code_text = _merge_para_text(
+            para_block,
+            escape_markdown=False,
+            list_line_break='\n',
+        )
+        if not code_text:
+            return ''
+        code_text = '\n'.join(line.rstrip() for line in code_text.split('\n'))
+        guess_lang = para_block.get('guess_lang', 'txt') or 'txt'
+        return f"```{guess_lang}\n{code_text}\n```"
+
+    return _merge_para_text(para_block)
+
+
+def _merge_para_text(para_block, escape_markdown=True, list_line_break='  \n'):
     # 将普通文本段落 block 渲染成 markdown 字符串。
     # 处理流程分为三层：
     # 1. 先收集文本内容做语言检测
@@ -185,12 +223,12 @@ def merge_para_with_text(para_block):
     para_parts = []
 
     for line_idx, line in enumerate(para_block['lines']):
-        line_prefix = _line_prefix(line_idx, line)
+        line_prefix = _line_prefix(line_idx, line, list_line_break)
         if line_prefix:
             para_parts.append(line_prefix)
 
         for span_idx, span in enumerate(line['spans']):
-            rendered_span = _render_span(span)
+            rendered_span = _render_span(span, escape_markdown=escape_markdown)
             if rendered_span is None:
                 continue
 
@@ -211,6 +249,13 @@ def merge_para_with_text(para_block):
     return ''.join(para_parts).rstrip()
 
 
+def _is_fenced_code_block(para_block):
+    return (
+        para_block.get('type') == BlockType.CODE_BODY
+        and para_block.get('sub_type') == BlockType.CODE
+    )
+
+
 def _collect_text_for_lang_detection(para_block):
     # 只收集 TEXT span 的内容，用于语言检测。
     # 这里会先做全角转半角，但不会修改原始输入数据。
@@ -228,14 +273,16 @@ def _normalize_text_content(content):
     return full_to_half_exclude_marks(content or '')
 
 
-def _render_span(span):
+def _render_span(span, escape_markdown=True):
     # 将单个 span 渲染成 markdown 片段。
     # 这里只负责“渲染成什么文本”，不决定后面是否补空格。
     span_type = span['type']
     content = ''
 
     if span_type == ContentType.TEXT:
-        content = escape_special_markdown_char(_normalize_text_content(span.get('content', '')))
+        content = _normalize_text_content(span.get('content', ''))
+        if escape_markdown:
+            content = escape_special_markdown_char(content)
     elif span_type == ContentType.INLINE_EQUATION:
         if span.get('content', ''):
             content = f"{inline_left_delimiter}{span['content']}{inline_right_delimiter}"
@@ -283,11 +330,11 @@ def _join_rendered_span(para_block, block_lang, line, line_idx, span_idx, span_t
     return content, ' '
 
 
-def _line_prefix(line_idx, line):
+def _line_prefix(line_idx, line, list_line_break='  \n'):
     # 处理进入新 list item 前的 block 级换行。
     # 这里保留历史语义：list 起始行前插入一个 hard break。
     if line_idx >= 1 and line.get(ListLineTag.IS_LIST_START_LINE, False):
-        return '  \n'
+        return list_line_break
     return ''
 
 
diff --git a/mineru/utils/enum_class.py b/mineru/utils/enum_class.py
index 98352eb6..a45aee39 100644
--- a/mineru/utils/enum_class.py
+++ b/mineru/utils/enum_class.py
@@ -7,7 +7,6 @@ class BlockType:
     IMAGE_BODY = 'image_body'
     TABLE_BODY = 'table_body'
     CHART_BODY = 'chart_body'
-    ALGORITHM_BODY = 'algorithm_body'
     CAPTION = 'caption'  # generic caption type (e.g., for Word documents)
     IMAGE_CAPTION = 'image_caption'
     TABLE_CAPTION = 'table_caption'
@@ -17,7 +16,6 @@ class BlockType:
     IMAGE_FOOTNOTE = 'image_footnote'
     TABLE_FOOTNOTE = 'table_footnote'
     CHART_FOOTNOTE = 'chart_footnote'
-    ALGORITHM_FOOTNOTE = 'algorithm_footnote'
     TEXT = 'text'
     TITLE = 'title'
     INTERLINE_EQUATION = 'interline_equation'
@@ -30,6 +28,7 @@ class BlockType:
     CODE = "code"
     CODE_BODY = "code_body"
     CODE_CAPTION = "code_caption"
+    CODE_FOOTNOTE = "code_footnote"
     ALGORITHM = "algorithm"
     REF_TEXT = "ref_text"
     PHONETIC = "phonetic"