From 09fc22fcc20ec61316cf8f63cefdf0fd1ea3b55a Mon Sep 17 00:00:00 2001 From: myhloli Date: Sat, 21 Mar 2026 03:33:34 +0800 Subject: [PATCH] feat: add support for SEAL block type in bbox drawing and update image handling in markdown content --- .../backend/pipeline/pipeline_middle_json_mkcontent.py | 9 ++++++++- mineru/utils/draw_bbox.py | 2 ++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/mineru/backend/pipeline/pipeline_middle_json_mkcontent.py b/mineru/backend/pipeline/pipeline_middle_json_mkcontent.py index 49a3b136..dc464a35 100644 --- a/mineru/backend/pipeline/pipeline_middle_json_mkcontent.py +++ b/mineru/backend/pipeline/pipeline_middle_json_mkcontent.py @@ -32,7 +32,14 @@ def make_blocks_to_markdown(paras_of_layout, if para_block['lines'][0]['spans'][0].get('content', ''): para_text = merge_para_with_text(para_block) else: - para_text += f"![]({img_buket_path}/{para_block['lines'][0]['spans'][0]['image_path']})" + para_text = f"![]({img_buket_path}/{para_block['lines'][0]['spans'][0]['image_path']})" + elif para_type == BlockType.SEAL: + if len(para_block['lines']) == 0 or len(para_block['lines'][0]['spans']) == 0: + continue + para_text = f"![]({img_buket_path}/{para_block['lines'][0]['spans'][0]['image_path']})" + if para_block['lines'][0]['spans'][0].get('content', []): + content = " ".join(para_block['lines'][0]['spans'][0]['content']) + para_text += f" \n{content}" elif para_type == BlockType.IMAGE: if mode == MakeMode.NLP_MD: continue diff --git a/mineru/utils/draw_bbox.py b/mineru/utils/draw_bbox.py index ef3cd762..9a93ebbf 100644 --- a/mineru/utils/draw_bbox.py +++ b/mineru/utils/draw_bbox.py @@ -188,6 +188,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename): elif nested_block["type"] == BlockType.CHART_FOOTNOTE: bbox = nested_block["bbox"] imgs_footnote.append(bbox) + elif block["type"] == BlockType.SEAL: + imgs_body.append(bbox) elif block["type"] == BlockType.TITLE: titles.append(bbox) elif block["type"] in [BlockType.TEXT, BlockType.REF_TEXT, BlockType.ABSTRACT]: