feat: refactor algorithm block handling to use code block type and enhance markdown rendering

2026-03-27 11:08:32 +07:00 · 2026-03-20 18:26:34 +08:00
parent 882a0ee72f
commit 61ecf1bc1b
3 changed files with 87 additions and 22 deletions
--- a/mineru/backend/pipeline/pipeline_magic_model.py
+++ b/mineru/backend/pipeline/pipeline_magic_model.py
@@ -1,4 +1,5 @@
 from mineru.backend.pipeline.para_split import ListLineTag
+from mineru.backend.pipeline.pipeline_middle_json_mkcontent import _merge_para_text
 from mineru.utils.boxbase import (
    bbox_center_distance,
    bbox_distance,
@@ -6,6 +7,7 @@ from mineru.utils.boxbase import (
    calculate_overlap_area_in_bbox1_area_ratio,
 )
 from mineru.utils.enum_class import ContentType, BlockType
+from mineru.utils.guess_suffix_or_lang import guess_language_by_text
 from mineru.utils.span_block_fix import merge_spans_to_vertical_line, vertical_line_sort_spans_from_top_to_bottom, \
    merge_spans_to_line, line_sort_spans_by_left_to_right
 from mineru.utils.span_pre_proc import txt_spans_extract
@@ -15,7 +17,7 @@ class MagicModel:

    PP_DOCLAYOUT_V2_LABELS_TO_BLOCK_TYPES = {
        "abstract": BlockType.ABSTRACT,
-        "algorithm": BlockType.ALGORITHM,
+        "algorithm": BlockType.CODE,
        "aside_text": BlockType.ASIDE_TEXT,
        "chart": BlockType.CHART,
        "content": BlockType.INDEX,
@@ -39,7 +41,7 @@ class MagicModel:
        "vision_footnote": BlockType.FOOTNOTE,
    }

-    VISUAL_MAIN_TYPES = (BlockType.IMAGE, BlockType.TABLE, BlockType.CHART, BlockType.ALGORITHM)
+    VISUAL_MAIN_TYPES = (BlockType.IMAGE, BlockType.TABLE, BlockType.CHART, BlockType.CODE)
    VISUAL_CHILD_TYPES = (BlockType.CAPTION, BlockType.FOOTNOTE)
    VISUAL_TYPE_MAPPING = {
        BlockType.IMAGE: {
@@ -57,10 +59,10 @@ class MagicModel:
            "caption": BlockType.CHART_CAPTION,
            "footnote": BlockType.CHART_FOOTNOTE,
        },
-        BlockType.ALGORITHM: {
-            "body": BlockType.ALGORITHM_BODY,
-            "caption": BlockType.ALGORITHM_CAPTION,
-            "footnote": BlockType.ALGORITHM_FOOTNOTE,
+        BlockType.CODE: {
+            "body": BlockType.CODE_BODY,
+            "caption": BlockType.CODE_CAPTION,
+            "footnote": BlockType.CODE_FOOTNOTE,
        }
    }

@@ -129,9 +131,18 @@ class MagicModel:
            block_lines = merge_spans_to_line(block['spans'])
            sort_block_lines = line_sort_spans_by_left_to_right(block_lines)

-        if block["type"] == BlockType.ALGORITHM:
+        if block["type"] == BlockType.CODE:
            for line in sort_block_lines:
                line[ListLineTag.IS_LIST_START_LINE] = True
+            code_content = _merge_para_text(
+                {'lines': sort_block_lines},
+                False,
+                '\n'
+            )
+            guess_lang = guess_language_by_text(code_content)
+            if guess_lang not in ["txt", "unknown"]:
+                block["sub_type"] = "code"
+                block["guess_lang"] = guess_lang

        block['lines'] = sort_block_lines
        del block['spans']
@@ -172,6 +183,14 @@ class MagicModel:
            ]:
                self.discarded_blocks.append(block)
            else:
+                # 单独处理code block
+                if block["type"] in [BlockType.CODE]:
+                    for sub_block in block["blocks"]:
+                        if sub_block["type"] == BlockType.CODE_BODY:
+                            block["sub_type"] = sub_block.pop("sub_type", "algorithm")
+                            if block["sub_type"] == "code":
+                                block["guess_lang"] = sub_block.pop("guess_lang", "txt")
+
                self.preproc_blocks.append(block)

    def __build_page_blocks(self):
@@ -179,7 +198,7 @@ class MagicModel:
        for block in self.page_blocks:
            if block["type"] in [
                BlockType.ABSTRACT,
-                BlockType.ALGORITHM,
+                BlockType.CODE,
                BlockType.ASIDE_TEXT,
                BlockType.INDEX,
                BlockType.DOC_TITLE,
--- a/mineru/backend/pipeline/pipeline_middle_json_mkcontent.py
+++ b/mineru/backend/pipeline/pipeline_middle_json_mkcontent.py
@@ -48,7 +48,7 @@ def make_blocks_to_markdown(paras_of_layout,
                continue
            elif mode == MakeMode.MM_MD:
                para_text = merge_visual_blocks_to_markdown(para_block, img_buket_path)
-        elif para_type == BlockType.ALGORITHM:
+        elif para_type == BlockType.CODE:
            para_text = merge_visual_blocks_to_markdown(para_block)

        if para_text.strip() == '':
@@ -60,13 +60,14 @@ def make_blocks_to_markdown(paras_of_layout,


 def merge_visual_blocks_to_markdown(para_block, img_buket_path=''):
-    # 将 image/table/algorithm 这类视觉块的子 block 按阅读顺序拼接成 markdown。
+    # 将 image/table/chart/code 这类视觉块的子 block 按阅读顺序拼接成 markdown。
    # 这里不再写死 caption/body/footnote 的优先级，而是先展开成 segment，
    # 再根据 markdown_line / html_block 两类片段决定分隔方式。
    rendered_segments = []

    for block in get_blocks_in_index_order(para_block.get('blocks', [])):
-        rendered_segments.extend(render_visual_block_segments(block, img_buket_path))
+        render_block = _inherit_parent_code_render_metadata(block, para_block)
+        rendered_segments.extend(render_visual_block_segments(render_block, img_buket_path))

    para_text = ''
    prev_segment_kind = None
@@ -91,6 +92,27 @@ def get_blocks_in_index_order(blocks):
    ]


+def _inherit_parent_code_render_metadata(block, parent_block):
+    # pipeline_magic_model 会把 code_body 的 sub_type/guess_lang 提升到父 code block。
+    # markdown 渲染 code_body 时需要把这两个字段临时透传回来，但不能修改原始输入。
+    if block.get('type') != BlockType.CODE_BODY:
+        return block
+    if parent_block.get('type') != BlockType.CODE:
+        return block
+
+    needs_sub_type = 'sub_type' not in block and 'sub_type' in parent_block
+    needs_guess_lang = 'guess_lang' not in block and 'guess_lang' in parent_block
+    if not needs_sub_type and not needs_guess_lang:
+        return block
+
+    render_block = dict(block)
+    if needs_sub_type:
+        render_block['sub_type'] = parent_block['sub_type']
+    if needs_guess_lang:
+        render_block['guess_lang'] = parent_block['guess_lang']
+    return render_block
+
+
 def render_visual_block_segments(block, img_buket_path=''):
    # 将单个视觉子 block 渲染成一个或多个 segment。
    # 文本类子块统一输出 markdown_line；
@@ -102,9 +124,9 @@ def render_visual_block_segments(block, img_buket_path=''):
        BlockType.IMAGE_FOOTNOTE,
        BlockType.TABLE_CAPTION,
        BlockType.TABLE_FOOTNOTE,
-        BlockType.ALGORITHM_BODY,
-        BlockType.ALGORITHM_CAPTION,
-        BlockType.ALGORITHM_FOOTNOTE,
+        BlockType.CODE_BODY,
+        BlockType.CODE_CAPTION,
+        BlockType.CODE_FOOTNOTE,
        BlockType.CHART_CAPTION,
        BlockType.CHART_FOOTNOTE,
    ]:
@@ -176,6 +198,22 @@ CJK_LANGS = {'zh', 'ja', 'ko'}


 def merge_para_with_text(para_block):
+    if _is_fenced_code_block(para_block):
+        code_text = _merge_para_text(
+            para_block,
+            escape_markdown=False,
+            list_line_break='\n',
+        )
+        if not code_text:
+            return ''
+        code_text = '\n'.join(line.rstrip() for line in code_text.split('\n'))
+        guess_lang = para_block.get('guess_lang', 'txt') or 'txt'
+        return f"```{guess_lang}\n{code_text}\n```"
+
+    return _merge_para_text(para_block)
+
+
+def _merge_para_text(para_block, escape_markdown=True, list_line_break='  \n'):
    # 将普通文本段落 block 渲染成 markdown 字符串。
    # 处理流程分为三层：
    # 1. 先收集文本内容做语言检测
@@ -185,12 +223,12 @@ def merge_para_with_text(para_block):
    para_parts = []

    for line_idx, line in enumerate(para_block['lines']):
-        line_prefix = _line_prefix(line_idx, line)
+        line_prefix = _line_prefix(line_idx, line, list_line_break)
        if line_prefix:
            para_parts.append(line_prefix)

        for span_idx, span in enumerate(line['spans']):
-            rendered_span = _render_span(span)
+            rendered_span = _render_span(span, escape_markdown=escape_markdown)
            if rendered_span is None:
                continue

@@ -211,6 +249,13 @@ def merge_para_with_text(para_block):
    return ''.join(para_parts).rstrip()


+def _is_fenced_code_block(para_block):
+    return (
+        para_block.get('type') == BlockType.CODE_BODY
+        and para_block.get('sub_type') == BlockType.CODE
+    )
+
+
 def _collect_text_for_lang_detection(para_block):
    # 只收集 TEXT span 的内容，用于语言检测。
    # 这里会先做全角转半角，但不会修改原始输入数据。
@@ -228,14 +273,16 @@ def _normalize_text_content(content):
    return full_to_half_exclude_marks(content or '')


-def _render_span(span):
+def _render_span(span, escape_markdown=True):
    # 将单个 span 渲染成 markdown 片段。
    # 这里只负责“渲染成什么文本”，不决定后面是否补空格。
    span_type = span['type']
    content = ''

    if span_type == ContentType.TEXT:
-        content = escape_special_markdown_char(_normalize_text_content(span.get('content', '')))
+        content = _normalize_text_content(span.get('content', ''))
+        if escape_markdown:
+            content = escape_special_markdown_char(content)
    elif span_type == ContentType.INLINE_EQUATION:
        if span.get('content', ''):
            content = f"{inline_left_delimiter}{span['content']}{inline_right_delimiter}"
@@ -283,11 +330,11 @@ def _join_rendered_span(para_block, block_lang, line, line_idx, span_idx, span_t
    return content, ' '


-def _line_prefix(line_idx, line):
+def _line_prefix(line_idx, line, list_line_break='  \n'):
    # 处理进入新 list item 前的 block 级换行。
    # 这里保留历史语义：list 起始行前插入一个 hard break。
    if line_idx >= 1 and line.get(ListLineTag.IS_LIST_START_LINE, False):
-        return '  \n'
+        return list_line_break
    return ''


--- a/mineru/utils/enum_class.py
+++ b/mineru/utils/enum_class.py
@@ -7,7 +7,6 @@ class BlockType:
    IMAGE_BODY = 'image_body'
    TABLE_BODY = 'table_body'
    CHART_BODY = 'chart_body'
-    ALGORITHM_BODY = 'algorithm_body'
    CAPTION = 'caption'  # generic caption type (e.g., for Word documents)
    IMAGE_CAPTION = 'image_caption'
    TABLE_CAPTION = 'table_caption'
@@ -17,7 +16,6 @@ class BlockType:
    IMAGE_FOOTNOTE = 'image_footnote'
    TABLE_FOOTNOTE = 'table_footnote'
    CHART_FOOTNOTE = 'chart_footnote'
-    ALGORITHM_FOOTNOTE = 'algorithm_footnote'
    TEXT = 'text'
    TITLE = 'title'
    INTERLINE_EQUATION = 'interline_equation'
@@ -30,6 +28,7 @@ class BlockType:
    CODE = "code"
    CODE_BODY = "code_body"
    CODE_CAPTION = "code_caption"
+    CODE_FOOTNOTE = "code_footnote"
    ALGORITHM = "algorithm"
    REF_TEXT = "ref_text"
    PHONETIC = "phonetic"