diff --git a/mineru/cli/common.py b/mineru/cli/common.py index 68a342f0..c0a51bfc 100644 --- a/mineru/cli/common.py +++ b/mineru/cli/common.py @@ -11,7 +11,7 @@ from loguru import logger import pypdfium2 as pdfium from mineru.data.data_reader_writer import FileBasedDataWriter -from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox, draw_line_sort_bbox +from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox from mineru.utils.engine_utils import get_vlm_engine from mineru.utils.enum_class import MakeMode from mineru.utils.guess_suffix_or_lang import guess_suffix_by_bytes @@ -134,7 +134,6 @@ def _process_output( model_output=None, process_mode="vlm", ): - f_draw_line_sort_bbox = False from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make if process_mode == "pipeline": make_func = pipeline_union_make @@ -163,9 +162,6 @@ def _process_output( pdf_bytes, ) - if f_draw_line_sort_bbox: - draw_line_sort_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_line_sort.pdf") - image_dir = str(os.path.basename(local_image_dir)) if f_dump_md: diff --git a/mineru/utils/draw_bbox.py b/mineru/utils/draw_bbox.py index 9ed0368a..ef3cd762 100644 --- a/mineru/utils/draw_bbox.py +++ b/mineru/utils/draw_bbox.py @@ -121,10 +121,10 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename): dropped_bbox_list = [] tables_body_list, tables_caption_list, tables_footnote_list = [], [], [] imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], [] - codes_body_list, codes_caption_list = [], [] + codes_body_list, codes_caption_list, codes_footnote_list = [], [], [] titles_list = [] texts_list = [] - interequations_list = [] + interline_equations_list = [] lists_list = [] list_items_list = [] indexs_list = [] @@ -133,10 +133,10 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename): page_dropped_list = [] tables_body, tables_caption, tables_footnote = [], [], [] imgs_body, imgs_caption, imgs_footnote = [], [], [] - codes_body, codes_caption = [], [] + codes_body, codes_caption, codes_footnote = [], [], [] titles = [] texts = [] - interequations = [] + interline_equations = [] lists = [] list_items = [] indices = [] @@ -174,12 +174,26 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename): elif nested_block["type"] == BlockType.CODE_CAPTION: bbox = nested_block["bbox"] codes_caption.append(bbox) + elif nested_block["type"] == BlockType.CODE_FOOTNOTE: + bbox = nested_block["bbox"] + codes_footnote.append(bbox) + elif block["type"] == BlockType.CHART: + for nested_block in block["blocks"]: + if nested_block["type"] == BlockType.CHART_BODY: + bbox = nested_block["bbox"] + imgs_body.append(bbox) + elif nested_block["type"] == BlockType.CHART_CAPTION: + bbox = nested_block["bbox"] + imgs_caption.append(bbox) + elif nested_block["type"] == BlockType.CHART_FOOTNOTE: + bbox = nested_block["bbox"] + imgs_footnote.append(bbox) elif block["type"] == BlockType.TITLE: titles.append(bbox) - elif block["type"] in [BlockType.TEXT, BlockType.REF_TEXT]: + elif block["type"] in [BlockType.TEXT, BlockType.REF_TEXT, BlockType.ABSTRACT]: texts.append(bbox) elif block["type"] == BlockType.INTERLINE_EQUATION: - interequations.append(bbox) + interline_equations.append(bbox) elif block["type"] == BlockType.LIST: lists.append(bbox) if "blocks" in block: @@ -196,22 +210,23 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename): imgs_footnote_list.append(imgs_footnote) titles_list.append(titles) texts_list.append(texts) - interequations_list.append(interequations) + interline_equations_list.append(interline_equations) lists_list.append(lists) list_items_list.append(list_items) indexs_list.append(indices) codes_body_list.append(codes_body) codes_caption_list.append(codes_caption) + codes_footnote_list.append(codes_footnote) layout_bbox_list = [] - table_type_order = {"table_caption": 1, "table_body": 2, "table_footnote": 3} for page in pdf_info: page_block_list = [] for block in page["para_blocks"]: if block["type"] in [ BlockType.TEXT, BlockType.REF_TEXT, + BlockType.ABSTRACT, BlockType.TITLE, BlockType.INTERLINE_EQUATION, BlockType.LIST, @@ -219,21 +234,12 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename): ]: bbox = block["bbox"] page_block_list.append(bbox) - elif block["type"] in [BlockType.IMAGE]: + elif block["type"] in [BlockType.IMAGE, BlockType.CHART, BlockType.CODE, BlockType.TABLE]: for sub_block in block["blocks"]: - bbox = sub_block["bbox"] - page_block_list.append(bbox) - elif block["type"] in [BlockType.TABLE]: - sorted_blocks = sorted(block["blocks"], key=lambda x: table_type_order[x["type"]]) - for sub_block in sorted_blocks: if sub_block.get(SplitFlag.CROSS_PAGE, False): continue bbox = sub_block["bbox"] page_block_list.append(bbox) - elif block["type"] in [BlockType.CODE]: - for sub_block in block["blocks"]: - bbox = sub_block["bbox"] - page_block_list.append(bbox) layout_bbox_list.append(page_block_list) @@ -252,6 +258,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename): c = draw_bbox_without_number(i, codes_body_list, page, c, [102, 0, 204], True) c = draw_bbox_without_number(i, codes_caption_list, page, c, [204, 153, 255], True) + c = draw_bbox_without_number(i, codes_footnote_list, page, c, [229, 204, 255], True) c = draw_bbox_without_number(i, dropped_bbox_list, page, c, [158, 158, 158], True) c = draw_bbox_without_number(i, tables_body_list, page, c, [204, 204, 0], True) c = draw_bbox_without_number(i, tables_caption_list, page, c, [255, 255, 102], True) @@ -261,7 +268,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename): c = draw_bbox_without_number(i, imgs_footnote_list, page, c, [255, 178, 102], True) c = draw_bbox_without_number(i, titles_list, page, c, [102, 102, 255], True) c = draw_bbox_without_number(i, texts_list, page, c, [153, 0, 76], True) - c = draw_bbox_without_number(i, interequations_list, page, c, [0, 255, 0], True) + c = draw_bbox_without_number(i, interline_equations_list, page, c, [0, 255, 0], True) c = draw_bbox_without_number(i, lists_list, page, c, [40, 169, 92], True) c = draw_bbox_without_number(i, list_items_list, page, c, [40, 169, 92], False) c = draw_bbox_without_number(i, indexs_list, page, c, [40, 169, 92], True) @@ -392,87 +399,6 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename): output_pdf.write(f) -def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename): - layout_bbox_list = [] - - for page in pdf_info: - page_line_list = [] - for block in page['preproc_blocks']: - if block['type'] in [BlockType.TEXT]: - for line in block['lines']: - bbox = line['bbox'] - index = line['index'] - page_line_list.append({'index': index, 'bbox': bbox}) - elif block['type'] in [BlockType.TITLE, BlockType.INTERLINE_EQUATION]: - if 'virtual_lines' in block: - if len(block['virtual_lines']) > 0 and block['virtual_lines'][0].get('index', None) is not None: - for line in block['virtual_lines']: - bbox = line['bbox'] - index = line['index'] - page_line_list.append({'index': index, 'bbox': bbox}) - else: - for line in block['lines']: - bbox = line['bbox'] - index = line['index'] - page_line_list.append({'index': index, 'bbox': bbox}) - elif block['type'] in [BlockType.IMAGE, BlockType.TABLE]: - for sub_block in block['blocks']: - if sub_block['type'] in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY]: - if len(sub_block['virtual_lines']) > 0 and sub_block['virtual_lines'][0].get('index', None) is not None: - for line in sub_block['virtual_lines']: - bbox = line['bbox'] - index = line['index'] - page_line_list.append({'index': index, 'bbox': bbox}) - else: - for line in sub_block['lines']: - bbox = line['bbox'] - index = line['index'] - page_line_list.append({'index': index, 'bbox': bbox}) - elif sub_block['type'] in [BlockType.IMAGE_CAPTION, BlockType.TABLE_CAPTION, BlockType.IMAGE_FOOTNOTE, BlockType.TABLE_FOOTNOTE]: - for line in sub_block['lines']: - bbox = line['bbox'] - index = line['index'] - page_line_list.append({'index': index, 'bbox': bbox}) - sorted_bboxes = sorted(page_line_list, key=lambda x: x['index']) - layout_bbox_list.append(sorted_bbox['bbox'] for sorted_bbox in sorted_bboxes) - pdf_bytes_io = BytesIO(pdf_bytes) - pdf_docs = PdfReader(pdf_bytes_io) - output_pdf = PdfWriter() - - for i, page in enumerate(pdf_docs.pages): - # 获取原始页面尺寸 - page_width, page_height = float(page.cropbox[2]), float(page.cropbox[3]) - custom_page_size = (page_width, page_height) - - packet = BytesIO() - # 使用原始PDF的尺寸创建canvas - c = canvas.Canvas(packet, pagesize=custom_page_size) - - # 获取当前页面的数据 - draw_bbox_with_number(i, layout_bbox_list, page, c, [255, 0, 0], False) - - c.save() - packet.seek(0) - overlay_pdf = PdfReader(packet) - - # 添加检查确保overlay_pdf.pages不为空 - if len(overlay_pdf.pages) > 0: - new_page = PageObject(pdf=None) - new_page.update(page) - page = new_page - page.merge_page(overlay_pdf.pages[0]) - else: - # 记录日志并继续处理下一个页面 - # logger.warning(f"span.pdf: 第{i + 1}页未能生成有效的overlay PDF") - pass - - output_pdf.add_page(page) - - # Save the PDF - with open(f"{out_path}/{filename}", "wb") as f: - output_pdf.write(f) - - if __name__ == "__main__": # 读取PDF文件 pdf_path = "examples/demo1.pdf"