mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
feat: refactor algorithm block handling to use code block type and enhance markdown rendering
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
from mineru.backend.pipeline.para_split import ListLineTag
|
||||
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import _merge_para_text
|
||||
from mineru.utils.boxbase import (
|
||||
bbox_center_distance,
|
||||
bbox_distance,
|
||||
@@ -6,6 +7,7 @@ from mineru.utils.boxbase import (
|
||||
calculate_overlap_area_in_bbox1_area_ratio,
|
||||
)
|
||||
from mineru.utils.enum_class import ContentType, BlockType
|
||||
from mineru.utils.guess_suffix_or_lang import guess_language_by_text
|
||||
from mineru.utils.span_block_fix import merge_spans_to_vertical_line, vertical_line_sort_spans_from_top_to_bottom, \
|
||||
merge_spans_to_line, line_sort_spans_by_left_to_right
|
||||
from mineru.utils.span_pre_proc import txt_spans_extract
|
||||
@@ -15,7 +17,7 @@ class MagicModel:
|
||||
|
||||
PP_DOCLAYOUT_V2_LABELS_TO_BLOCK_TYPES = {
|
||||
"abstract": BlockType.ABSTRACT,
|
||||
"algorithm": BlockType.ALGORITHM,
|
||||
"algorithm": BlockType.CODE,
|
||||
"aside_text": BlockType.ASIDE_TEXT,
|
||||
"chart": BlockType.CHART,
|
||||
"content": BlockType.INDEX,
|
||||
@@ -39,7 +41,7 @@ class MagicModel:
|
||||
"vision_footnote": BlockType.FOOTNOTE,
|
||||
}
|
||||
|
||||
VISUAL_MAIN_TYPES = (BlockType.IMAGE, BlockType.TABLE, BlockType.CHART, BlockType.ALGORITHM)
|
||||
VISUAL_MAIN_TYPES = (BlockType.IMAGE, BlockType.TABLE, BlockType.CHART, BlockType.CODE)
|
||||
VISUAL_CHILD_TYPES = (BlockType.CAPTION, BlockType.FOOTNOTE)
|
||||
VISUAL_TYPE_MAPPING = {
|
||||
BlockType.IMAGE: {
|
||||
@@ -57,10 +59,10 @@ class MagicModel:
|
||||
"caption": BlockType.CHART_CAPTION,
|
||||
"footnote": BlockType.CHART_FOOTNOTE,
|
||||
},
|
||||
BlockType.ALGORITHM: {
|
||||
"body": BlockType.ALGORITHM_BODY,
|
||||
"caption": BlockType.ALGORITHM_CAPTION,
|
||||
"footnote": BlockType.ALGORITHM_FOOTNOTE,
|
||||
BlockType.CODE: {
|
||||
"body": BlockType.CODE_BODY,
|
||||
"caption": BlockType.CODE_CAPTION,
|
||||
"footnote": BlockType.CODE_FOOTNOTE,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -129,9 +131,18 @@ class MagicModel:
|
||||
block_lines = merge_spans_to_line(block['spans'])
|
||||
sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
|
||||
|
||||
if block["type"] == BlockType.ALGORITHM:
|
||||
if block["type"] == BlockType.CODE:
|
||||
for line in sort_block_lines:
|
||||
line[ListLineTag.IS_LIST_START_LINE] = True
|
||||
code_content = _merge_para_text(
|
||||
{'lines': sort_block_lines},
|
||||
False,
|
||||
'\n'
|
||||
)
|
||||
guess_lang = guess_language_by_text(code_content)
|
||||
if guess_lang not in ["txt", "unknown"]:
|
||||
block["sub_type"] = "code"
|
||||
block["guess_lang"] = guess_lang
|
||||
|
||||
block['lines'] = sort_block_lines
|
||||
del block['spans']
|
||||
@@ -172,6 +183,14 @@ class MagicModel:
|
||||
]:
|
||||
self.discarded_blocks.append(block)
|
||||
else:
|
||||
# 单独处理code block
|
||||
if block["type"] in [BlockType.CODE]:
|
||||
for sub_block in block["blocks"]:
|
||||
if sub_block["type"] == BlockType.CODE_BODY:
|
||||
block["sub_type"] = sub_block.pop("sub_type", "algorithm")
|
||||
if block["sub_type"] == "code":
|
||||
block["guess_lang"] = sub_block.pop("guess_lang", "txt")
|
||||
|
||||
self.preproc_blocks.append(block)
|
||||
|
||||
def __build_page_blocks(self):
|
||||
@@ -179,7 +198,7 @@ class MagicModel:
|
||||
for block in self.page_blocks:
|
||||
if block["type"] in [
|
||||
BlockType.ABSTRACT,
|
||||
BlockType.ALGORITHM,
|
||||
BlockType.CODE,
|
||||
BlockType.ASIDE_TEXT,
|
||||
BlockType.INDEX,
|
||||
BlockType.DOC_TITLE,
|
||||
|
||||
@@ -48,7 +48,7 @@ def make_blocks_to_markdown(paras_of_layout,
|
||||
continue
|
||||
elif mode == MakeMode.MM_MD:
|
||||
para_text = merge_visual_blocks_to_markdown(para_block, img_buket_path)
|
||||
elif para_type == BlockType.ALGORITHM:
|
||||
elif para_type == BlockType.CODE:
|
||||
para_text = merge_visual_blocks_to_markdown(para_block)
|
||||
|
||||
if para_text.strip() == '':
|
||||
@@ -60,13 +60,14 @@ def make_blocks_to_markdown(paras_of_layout,
|
||||
|
||||
|
||||
def merge_visual_blocks_to_markdown(para_block, img_buket_path=''):
|
||||
# 将 image/table/algorithm 这类视觉块的子 block 按阅读顺序拼接成 markdown。
|
||||
# 将 image/table/chart/code 这类视觉块的子 block 按阅读顺序拼接成 markdown。
|
||||
# 这里不再写死 caption/body/footnote 的优先级,而是先展开成 segment,
|
||||
# 再根据 markdown_line / html_block 两类片段决定分隔方式。
|
||||
rendered_segments = []
|
||||
|
||||
for block in get_blocks_in_index_order(para_block.get('blocks', [])):
|
||||
rendered_segments.extend(render_visual_block_segments(block, img_buket_path))
|
||||
render_block = _inherit_parent_code_render_metadata(block, para_block)
|
||||
rendered_segments.extend(render_visual_block_segments(render_block, img_buket_path))
|
||||
|
||||
para_text = ''
|
||||
prev_segment_kind = None
|
||||
@@ -91,6 +92,27 @@ def get_blocks_in_index_order(blocks):
|
||||
]
|
||||
|
||||
|
||||
def _inherit_parent_code_render_metadata(block, parent_block):
|
||||
# pipeline_magic_model 会把 code_body 的 sub_type/guess_lang 提升到父 code block。
|
||||
# markdown 渲染 code_body 时需要把这两个字段临时透传回来,但不能修改原始输入。
|
||||
if block.get('type') != BlockType.CODE_BODY:
|
||||
return block
|
||||
if parent_block.get('type') != BlockType.CODE:
|
||||
return block
|
||||
|
||||
needs_sub_type = 'sub_type' not in block and 'sub_type' in parent_block
|
||||
needs_guess_lang = 'guess_lang' not in block and 'guess_lang' in parent_block
|
||||
if not needs_sub_type and not needs_guess_lang:
|
||||
return block
|
||||
|
||||
render_block = dict(block)
|
||||
if needs_sub_type:
|
||||
render_block['sub_type'] = parent_block['sub_type']
|
||||
if needs_guess_lang:
|
||||
render_block['guess_lang'] = parent_block['guess_lang']
|
||||
return render_block
|
||||
|
||||
|
||||
def render_visual_block_segments(block, img_buket_path=''):
|
||||
# 将单个视觉子 block 渲染成一个或多个 segment。
|
||||
# 文本类子块统一输出 markdown_line;
|
||||
@@ -102,9 +124,9 @@ def render_visual_block_segments(block, img_buket_path=''):
|
||||
BlockType.IMAGE_FOOTNOTE,
|
||||
BlockType.TABLE_CAPTION,
|
||||
BlockType.TABLE_FOOTNOTE,
|
||||
BlockType.ALGORITHM_BODY,
|
||||
BlockType.ALGORITHM_CAPTION,
|
||||
BlockType.ALGORITHM_FOOTNOTE,
|
||||
BlockType.CODE_BODY,
|
||||
BlockType.CODE_CAPTION,
|
||||
BlockType.CODE_FOOTNOTE,
|
||||
BlockType.CHART_CAPTION,
|
||||
BlockType.CHART_FOOTNOTE,
|
||||
]:
|
||||
@@ -176,6 +198,22 @@ CJK_LANGS = {'zh', 'ja', 'ko'}
|
||||
|
||||
|
||||
def merge_para_with_text(para_block):
|
||||
if _is_fenced_code_block(para_block):
|
||||
code_text = _merge_para_text(
|
||||
para_block,
|
||||
escape_markdown=False,
|
||||
list_line_break='\n',
|
||||
)
|
||||
if not code_text:
|
||||
return ''
|
||||
code_text = '\n'.join(line.rstrip() for line in code_text.split('\n'))
|
||||
guess_lang = para_block.get('guess_lang', 'txt') or 'txt'
|
||||
return f"```{guess_lang}\n{code_text}\n```"
|
||||
|
||||
return _merge_para_text(para_block)
|
||||
|
||||
|
||||
def _merge_para_text(para_block, escape_markdown=True, list_line_break=' \n'):
|
||||
# 将普通文本段落 block 渲染成 markdown 字符串。
|
||||
# 处理流程分为三层:
|
||||
# 1. 先收集文本内容做语言检测
|
||||
@@ -185,12 +223,12 @@ def merge_para_with_text(para_block):
|
||||
para_parts = []
|
||||
|
||||
for line_idx, line in enumerate(para_block['lines']):
|
||||
line_prefix = _line_prefix(line_idx, line)
|
||||
line_prefix = _line_prefix(line_idx, line, list_line_break)
|
||||
if line_prefix:
|
||||
para_parts.append(line_prefix)
|
||||
|
||||
for span_idx, span in enumerate(line['spans']):
|
||||
rendered_span = _render_span(span)
|
||||
rendered_span = _render_span(span, escape_markdown=escape_markdown)
|
||||
if rendered_span is None:
|
||||
continue
|
||||
|
||||
@@ -211,6 +249,13 @@ def merge_para_with_text(para_block):
|
||||
return ''.join(para_parts).rstrip()
|
||||
|
||||
|
||||
def _is_fenced_code_block(para_block):
|
||||
return (
|
||||
para_block.get('type') == BlockType.CODE_BODY
|
||||
and para_block.get('sub_type') == BlockType.CODE
|
||||
)
|
||||
|
||||
|
||||
def _collect_text_for_lang_detection(para_block):
|
||||
# 只收集 TEXT span 的内容,用于语言检测。
|
||||
# 这里会先做全角转半角,但不会修改原始输入数据。
|
||||
@@ -228,14 +273,16 @@ def _normalize_text_content(content):
|
||||
return full_to_half_exclude_marks(content or '')
|
||||
|
||||
|
||||
def _render_span(span):
|
||||
def _render_span(span, escape_markdown=True):
|
||||
# 将单个 span 渲染成 markdown 片段。
|
||||
# 这里只负责“渲染成什么文本”,不决定后面是否补空格。
|
||||
span_type = span['type']
|
||||
content = ''
|
||||
|
||||
if span_type == ContentType.TEXT:
|
||||
content = escape_special_markdown_char(_normalize_text_content(span.get('content', '')))
|
||||
content = _normalize_text_content(span.get('content', ''))
|
||||
if escape_markdown:
|
||||
content = escape_special_markdown_char(content)
|
||||
elif span_type == ContentType.INLINE_EQUATION:
|
||||
if span.get('content', ''):
|
||||
content = f"{inline_left_delimiter}{span['content']}{inline_right_delimiter}"
|
||||
@@ -283,11 +330,11 @@ def _join_rendered_span(para_block, block_lang, line, line_idx, span_idx, span_t
|
||||
return content, ' '
|
||||
|
||||
|
||||
def _line_prefix(line_idx, line):
|
||||
def _line_prefix(line_idx, line, list_line_break=' \n'):
|
||||
# 处理进入新 list item 前的 block 级换行。
|
||||
# 这里保留历史语义:list 起始行前插入一个 hard break。
|
||||
if line_idx >= 1 and line.get(ListLineTag.IS_LIST_START_LINE, False):
|
||||
return ' \n'
|
||||
return list_line_break
|
||||
return ''
|
||||
|
||||
|
||||
|
||||
@@ -7,7 +7,6 @@ class BlockType:
|
||||
IMAGE_BODY = 'image_body'
|
||||
TABLE_BODY = 'table_body'
|
||||
CHART_BODY = 'chart_body'
|
||||
ALGORITHM_BODY = 'algorithm_body'
|
||||
CAPTION = 'caption' # generic caption type (e.g., for Word documents)
|
||||
IMAGE_CAPTION = 'image_caption'
|
||||
TABLE_CAPTION = 'table_caption'
|
||||
@@ -17,7 +16,6 @@ class BlockType:
|
||||
IMAGE_FOOTNOTE = 'image_footnote'
|
||||
TABLE_FOOTNOTE = 'table_footnote'
|
||||
CHART_FOOTNOTE = 'chart_footnote'
|
||||
ALGORITHM_FOOTNOTE = 'algorithm_footnote'
|
||||
TEXT = 'text'
|
||||
TITLE = 'title'
|
||||
INTERLINE_EQUATION = 'interline_equation'
|
||||
@@ -30,6 +28,7 @@ class BlockType:
|
||||
CODE = "code"
|
||||
CODE_BODY = "code_body"
|
||||
CODE_CAPTION = "code_caption"
|
||||
CODE_FOOTNOTE = "code_footnote"
|
||||
ALGORITHM = "algorithm"
|
||||
REF_TEXT = "ref_text"
|
||||
PHONETIC = "phonetic"
|
||||
|
||||
Reference in New Issue
Block a user