diff --git a/mineru/backend/hybrid/hybrid_analyze.py b/mineru/backend/hybrid/hybrid_analyze.py index 4bf30483..31da7321 100644 --- a/mineru/backend/hybrid/hybrid_analyze.py +++ b/mineru/backend/hybrid/hybrid_analyze.py @@ -443,7 +443,7 @@ def doc_analyze( ) clean_memory(device) - return middle_json, results + return middle_json, results, _vlm_ocr_enable async def aio_doc_analyze( @@ -509,5 +509,5 @@ async def aio_doc_analyze( ) clean_memory(device) - return middle_json, results + return middle_json, results, _vlm_ocr_enable diff --git a/mineru/cli/common.py b/mineru/cli/common.py index 609eda56..fb20bed7 100644 --- a/mineru/cli/common.py +++ b/mineru/cli/common.py @@ -327,7 +327,6 @@ def _process_hybrid( **kwargs, ): """同步处理hybrid后端逻辑""" - f_draw_span_bbox = False if not backend.endswith("client"): server_url = None @@ -336,7 +335,7 @@ def _process_hybrid( local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, f"hybrid_{parse_method}") image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir) - middle_json, infer_result = hybrid_doc_analyze( + middle_json, infer_result, _vlm_ocr_enable= hybrid_doc_analyze( pdf_bytes, image_writer=image_writer, backend=backend, @@ -349,6 +348,9 @@ def _process_hybrid( pdf_info = middle_json["pdf_info"] + # f_draw_span_bbox = not _vlm_ocr_enable + f_draw_span_bbox = False + _process_output( pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir, md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf, @@ -377,7 +379,7 @@ async def _async_process_hybrid( **kwargs, ): """异步处理hybrid后端逻辑""" - f_draw_span_bbox = False + if not backend.endswith("client"): server_url = None @@ -386,7 +388,7 @@ async def _async_process_hybrid( local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, f"hybrid_{parse_method}") image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir) - middle_json, infer_result = await aio_hybrid_doc_analyze( + middle_json, infer_result, _vlm_ocr_enable = await aio_hybrid_doc_analyze( pdf_bytes, image_writer=image_writer, backend=backend, @@ -399,6 +401,9 @@ async def _async_process_hybrid( pdf_info = middle_json["pdf_info"] + # f_draw_span_bbox = not _vlm_ocr_enable + f_draw_span_bbox = False + _process_output( pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir, md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf, diff --git a/mineru/utils/span_pre_proc.py b/mineru/utils/span_pre_proc.py index 98c36477..159b08f0 100644 --- a/mineru/utils/span_pre_proc.py +++ b/mineru/utils/span_pre_proc.py @@ -1,5 +1,6 @@ # Copyright (c) Opendatalab. All rights reserved. import collections +import math import re import statistics @@ -128,8 +129,9 @@ def txt_spans_extract(pdf_page, spans, pil_img, scale, all_bboxes, all_discarded page_all_lines = [] for block in page_dict['blocks']: for line in block['lines']: - if 0 < abs(line['rotation']) < 90: - # 旋转角度在0-90度之间的行,直接跳过 + rotation_degrees = math.degrees(line['rotation']) + # 旋转角度不为0, 90, 180, 270的行,直接跳过(rotation_degrees的值可能不为整数) + if not any(abs(rotation_degrees - angle) < 0.1 for angle in [0, 90, 180, 270]): continue page_all_lines.append(line) for span in line['spans']: @@ -159,7 +161,7 @@ def txt_spans_extract(pdf_page, spans, pil_img, scale, all_bboxes, all_discarded if block[7] in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY, BlockType.INTERLINE_EQUATION]: continue if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5: - if span['height'] > median_span_height * 3 and span['height'] > span['width'] * 3: + if span['height'] > median_span_height * 2.3 and span['height'] > span['width'] * 2.3: vertical_spans.append(span) elif block in all_bboxes: useful_spans.append(span)