mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
feat: enhance code block handling by adding support for code footnotes and refining layout processing
This commit is contained in:
@@ -11,7 +11,7 @@ from loguru import logger
|
||||
import pypdfium2 as pdfium
|
||||
|
||||
from mineru.data.data_reader_writer import FileBasedDataWriter
|
||||
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox, draw_line_sort_bbox
|
||||
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
|
||||
from mineru.utils.engine_utils import get_vlm_engine
|
||||
from mineru.utils.enum_class import MakeMode
|
||||
from mineru.utils.guess_suffix_or_lang import guess_suffix_by_bytes
|
||||
@@ -134,7 +134,6 @@ def _process_output(
|
||||
model_output=None,
|
||||
process_mode="vlm",
|
||||
):
|
||||
f_draw_line_sort_bbox = False
|
||||
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
|
||||
if process_mode == "pipeline":
|
||||
make_func = pipeline_union_make
|
||||
@@ -163,9 +162,6 @@ def _process_output(
|
||||
pdf_bytes,
|
||||
)
|
||||
|
||||
if f_draw_line_sort_bbox:
|
||||
draw_line_sort_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_line_sort.pdf")
|
||||
|
||||
image_dir = str(os.path.basename(local_image_dir))
|
||||
|
||||
if f_dump_md:
|
||||
|
||||
@@ -121,10 +121,10 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
|
||||
dropped_bbox_list = []
|
||||
tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
|
||||
imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], []
|
||||
codes_body_list, codes_caption_list = [], []
|
||||
codes_body_list, codes_caption_list, codes_footnote_list = [], [], []
|
||||
titles_list = []
|
||||
texts_list = []
|
||||
interequations_list = []
|
||||
interline_equations_list = []
|
||||
lists_list = []
|
||||
list_items_list = []
|
||||
indexs_list = []
|
||||
@@ -133,10 +133,10 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
|
||||
page_dropped_list = []
|
||||
tables_body, tables_caption, tables_footnote = [], [], []
|
||||
imgs_body, imgs_caption, imgs_footnote = [], [], []
|
||||
codes_body, codes_caption = [], []
|
||||
codes_body, codes_caption, codes_footnote = [], [], []
|
||||
titles = []
|
||||
texts = []
|
||||
interequations = []
|
||||
interline_equations = []
|
||||
lists = []
|
||||
list_items = []
|
||||
indices = []
|
||||
@@ -174,12 +174,26 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
|
||||
elif nested_block["type"] == BlockType.CODE_CAPTION:
|
||||
bbox = nested_block["bbox"]
|
||||
codes_caption.append(bbox)
|
||||
elif nested_block["type"] == BlockType.CODE_FOOTNOTE:
|
||||
bbox = nested_block["bbox"]
|
||||
codes_footnote.append(bbox)
|
||||
elif block["type"] == BlockType.CHART:
|
||||
for nested_block in block["blocks"]:
|
||||
if nested_block["type"] == BlockType.CHART_BODY:
|
||||
bbox = nested_block["bbox"]
|
||||
imgs_body.append(bbox)
|
||||
elif nested_block["type"] == BlockType.CHART_CAPTION:
|
||||
bbox = nested_block["bbox"]
|
||||
imgs_caption.append(bbox)
|
||||
elif nested_block["type"] == BlockType.CHART_FOOTNOTE:
|
||||
bbox = nested_block["bbox"]
|
||||
imgs_footnote.append(bbox)
|
||||
elif block["type"] == BlockType.TITLE:
|
||||
titles.append(bbox)
|
||||
elif block["type"] in [BlockType.TEXT, BlockType.REF_TEXT]:
|
||||
elif block["type"] in [BlockType.TEXT, BlockType.REF_TEXT, BlockType.ABSTRACT]:
|
||||
texts.append(bbox)
|
||||
elif block["type"] == BlockType.INTERLINE_EQUATION:
|
||||
interequations.append(bbox)
|
||||
interline_equations.append(bbox)
|
||||
elif block["type"] == BlockType.LIST:
|
||||
lists.append(bbox)
|
||||
if "blocks" in block:
|
||||
@@ -196,22 +210,23 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
|
||||
imgs_footnote_list.append(imgs_footnote)
|
||||
titles_list.append(titles)
|
||||
texts_list.append(texts)
|
||||
interequations_list.append(interequations)
|
||||
interline_equations_list.append(interline_equations)
|
||||
lists_list.append(lists)
|
||||
list_items_list.append(list_items)
|
||||
indexs_list.append(indices)
|
||||
codes_body_list.append(codes_body)
|
||||
codes_caption_list.append(codes_caption)
|
||||
codes_footnote_list.append(codes_footnote)
|
||||
|
||||
layout_bbox_list = []
|
||||
|
||||
table_type_order = {"table_caption": 1, "table_body": 2, "table_footnote": 3}
|
||||
for page in pdf_info:
|
||||
page_block_list = []
|
||||
for block in page["para_blocks"]:
|
||||
if block["type"] in [
|
||||
BlockType.TEXT,
|
||||
BlockType.REF_TEXT,
|
||||
BlockType.ABSTRACT,
|
||||
BlockType.TITLE,
|
||||
BlockType.INTERLINE_EQUATION,
|
||||
BlockType.LIST,
|
||||
@@ -219,21 +234,12 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
|
||||
]:
|
||||
bbox = block["bbox"]
|
||||
page_block_list.append(bbox)
|
||||
elif block["type"] in [BlockType.IMAGE]:
|
||||
elif block["type"] in [BlockType.IMAGE, BlockType.CHART, BlockType.CODE, BlockType.TABLE]:
|
||||
for sub_block in block["blocks"]:
|
||||
bbox = sub_block["bbox"]
|
||||
page_block_list.append(bbox)
|
||||
elif block["type"] in [BlockType.TABLE]:
|
||||
sorted_blocks = sorted(block["blocks"], key=lambda x: table_type_order[x["type"]])
|
||||
for sub_block in sorted_blocks:
|
||||
if sub_block.get(SplitFlag.CROSS_PAGE, False):
|
||||
continue
|
||||
bbox = sub_block["bbox"]
|
||||
page_block_list.append(bbox)
|
||||
elif block["type"] in [BlockType.CODE]:
|
||||
for sub_block in block["blocks"]:
|
||||
bbox = sub_block["bbox"]
|
||||
page_block_list.append(bbox)
|
||||
|
||||
layout_bbox_list.append(page_block_list)
|
||||
|
||||
@@ -252,6 +258,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
|
||||
|
||||
c = draw_bbox_without_number(i, codes_body_list, page, c, [102, 0, 204], True)
|
||||
c = draw_bbox_without_number(i, codes_caption_list, page, c, [204, 153, 255], True)
|
||||
c = draw_bbox_without_number(i, codes_footnote_list, page, c, [229, 204, 255], True)
|
||||
c = draw_bbox_without_number(i, dropped_bbox_list, page, c, [158, 158, 158], True)
|
||||
c = draw_bbox_without_number(i, tables_body_list, page, c, [204, 204, 0], True)
|
||||
c = draw_bbox_without_number(i, tables_caption_list, page, c, [255, 255, 102], True)
|
||||
@@ -261,7 +268,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
|
||||
c = draw_bbox_without_number(i, imgs_footnote_list, page, c, [255, 178, 102], True)
|
||||
c = draw_bbox_without_number(i, titles_list, page, c, [102, 102, 255], True)
|
||||
c = draw_bbox_without_number(i, texts_list, page, c, [153, 0, 76], True)
|
||||
c = draw_bbox_without_number(i, interequations_list, page, c, [0, 255, 0], True)
|
||||
c = draw_bbox_without_number(i, interline_equations_list, page, c, [0, 255, 0], True)
|
||||
c = draw_bbox_without_number(i, lists_list, page, c, [40, 169, 92], True)
|
||||
c = draw_bbox_without_number(i, list_items_list, page, c, [40, 169, 92], False)
|
||||
c = draw_bbox_without_number(i, indexs_list, page, c, [40, 169, 92], True)
|
||||
@@ -392,87 +399,6 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
|
||||
output_pdf.write(f)
|
||||
|
||||
|
||||
def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
|
||||
layout_bbox_list = []
|
||||
|
||||
for page in pdf_info:
|
||||
page_line_list = []
|
||||
for block in page['preproc_blocks']:
|
||||
if block['type'] in [BlockType.TEXT]:
|
||||
for line in block['lines']:
|
||||
bbox = line['bbox']
|
||||
index = line['index']
|
||||
page_line_list.append({'index': index, 'bbox': bbox})
|
||||
elif block['type'] in [BlockType.TITLE, BlockType.INTERLINE_EQUATION]:
|
||||
if 'virtual_lines' in block:
|
||||
if len(block['virtual_lines']) > 0 and block['virtual_lines'][0].get('index', None) is not None:
|
||||
for line in block['virtual_lines']:
|
||||
bbox = line['bbox']
|
||||
index = line['index']
|
||||
page_line_list.append({'index': index, 'bbox': bbox})
|
||||
else:
|
||||
for line in block['lines']:
|
||||
bbox = line['bbox']
|
||||
index = line['index']
|
||||
page_line_list.append({'index': index, 'bbox': bbox})
|
||||
elif block['type'] in [BlockType.IMAGE, BlockType.TABLE]:
|
||||
for sub_block in block['blocks']:
|
||||
if sub_block['type'] in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY]:
|
||||
if len(sub_block['virtual_lines']) > 0 and sub_block['virtual_lines'][0].get('index', None) is not None:
|
||||
for line in sub_block['virtual_lines']:
|
||||
bbox = line['bbox']
|
||||
index = line['index']
|
||||
page_line_list.append({'index': index, 'bbox': bbox})
|
||||
else:
|
||||
for line in sub_block['lines']:
|
||||
bbox = line['bbox']
|
||||
index = line['index']
|
||||
page_line_list.append({'index': index, 'bbox': bbox})
|
||||
elif sub_block['type'] in [BlockType.IMAGE_CAPTION, BlockType.TABLE_CAPTION, BlockType.IMAGE_FOOTNOTE, BlockType.TABLE_FOOTNOTE]:
|
||||
for line in sub_block['lines']:
|
||||
bbox = line['bbox']
|
||||
index = line['index']
|
||||
page_line_list.append({'index': index, 'bbox': bbox})
|
||||
sorted_bboxes = sorted(page_line_list, key=lambda x: x['index'])
|
||||
layout_bbox_list.append(sorted_bbox['bbox'] for sorted_bbox in sorted_bboxes)
|
||||
pdf_bytes_io = BytesIO(pdf_bytes)
|
||||
pdf_docs = PdfReader(pdf_bytes_io)
|
||||
output_pdf = PdfWriter()
|
||||
|
||||
for i, page in enumerate(pdf_docs.pages):
|
||||
# 获取原始页面尺寸
|
||||
page_width, page_height = float(page.cropbox[2]), float(page.cropbox[3])
|
||||
custom_page_size = (page_width, page_height)
|
||||
|
||||
packet = BytesIO()
|
||||
# 使用原始PDF的尺寸创建canvas
|
||||
c = canvas.Canvas(packet, pagesize=custom_page_size)
|
||||
|
||||
# 获取当前页面的数据
|
||||
draw_bbox_with_number(i, layout_bbox_list, page, c, [255, 0, 0], False)
|
||||
|
||||
c.save()
|
||||
packet.seek(0)
|
||||
overlay_pdf = PdfReader(packet)
|
||||
|
||||
# 添加检查确保overlay_pdf.pages不为空
|
||||
if len(overlay_pdf.pages) > 0:
|
||||
new_page = PageObject(pdf=None)
|
||||
new_page.update(page)
|
||||
page = new_page
|
||||
page.merge_page(overlay_pdf.pages[0])
|
||||
else:
|
||||
# 记录日志并继续处理下一个页面
|
||||
# logger.warning(f"span.pdf: 第{i + 1}页未能生成有效的overlay PDF")
|
||||
pass
|
||||
|
||||
output_pdf.add_page(page)
|
||||
|
||||
# Save the PDF
|
||||
with open(f"{out_path}/{filename}", "wb") as f:
|
||||
output_pdf.write(f)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# 读取PDF文件
|
||||
pdf_path = "examples/demo1.pdf"
|
||||
|
||||
Reference in New Issue
Block a user