refactor: enhance hybrid backend logic and improve span processing

2026-03-27 02:58:54 +07:00 · 2025-12-29 19:03:30 +08:00
parent 7750d864ed
commit 037e5f2460
3 changed files with 16 additions and 9 deletions
--- a/mineru/backend/hybrid/hybrid_analyze.py
+++ b/mineru/backend/hybrid/hybrid_analyze.py
@@ -443,7 +443,7 @@ def doc_analyze(
    )

    clean_memory(device)
-    return middle_json, results
+    return middle_json, results, _vlm_ocr_enable


 async def aio_doc_analyze(
@@ -509,5 +509,5 @@ async def aio_doc_analyze(
    )

    clean_memory(device)
-    return middle_json, results
+    return middle_json, results, _vlm_ocr_enable

--- a/mineru/cli/common.py
+++ b/mineru/cli/common.py
@@ -327,7 +327,6 @@ def _process_hybrid(
        **kwargs,
 ):
    """同步处理hybrid后端逻辑"""
-    f_draw_span_bbox = False
    if not backend.endswith("client"):
        server_url = None

@@ -336,7 +335,7 @@ def _process_hybrid(
        local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, f"hybrid_{parse_method}")
        image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)

-        middle_json, infer_result = hybrid_doc_analyze(
+        middle_json, infer_result, _vlm_ocr_enable= hybrid_doc_analyze(
            pdf_bytes,
            image_writer=image_writer,
            backend=backend,
@@ -349,6 +348,9 @@ def _process_hybrid(

        pdf_info = middle_json["pdf_info"]

+        # f_draw_span_bbox = not _vlm_ocr_enable
+        f_draw_span_bbox = False
+
        _process_output(
            pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
            md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
@@ -377,7 +379,7 @@ async def _async_process_hybrid(
        **kwargs,
 ):
    """异步处理hybrid后端逻辑"""
-    f_draw_span_bbox = False
+
    if not backend.endswith("client"):
        server_url = None

@@ -386,7 +388,7 @@ async def _async_process_hybrid(
        local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, f"hybrid_{parse_method}")
        image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)

-        middle_json, infer_result = await aio_hybrid_doc_analyze(
+        middle_json, infer_result, _vlm_ocr_enable = await aio_hybrid_doc_analyze(
            pdf_bytes,
            image_writer=image_writer,
            backend=backend,
@@ -399,6 +401,9 @@ async def _async_process_hybrid(

        pdf_info = middle_json["pdf_info"]

+        # f_draw_span_bbox = not _vlm_ocr_enable
+        f_draw_span_bbox = False
+
        _process_output(
            pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
            md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
--- a/mineru/utils/span_pre_proc.py
+++ b/mineru/utils/span_pre_proc.py
@@ -1,5 +1,6 @@
 # Copyright (c) Opendatalab. All rights reserved.
 import collections
+import math
 import re
 import statistics

@@ -128,8 +129,9 @@ def txt_spans_extract(pdf_page, spans, pil_img, scale, all_bboxes, all_discarded
    page_all_lines = []
    for block in page_dict['blocks']:
        for line in block['lines']:
-            if 0 < abs(line['rotation']) < 90:
-                # 旋转角度在0-90度之间的行，直接跳过
+            rotation_degrees = math.degrees(line['rotation'])
+            # Skip lines whose rotation is not within 0.1 degrees of 0, 90, 180 or 270 (rotation_degrees may be non-integer)
+            if not any(abs(rotation_degrees - angle) < 0.1 for angle in [0, 90, 180, 270]):
                continue
            page_all_lines.append(line)
            for span in line['spans']:
@@ -159,7 +161,7 @@ def txt_spans_extract(pdf_page, spans, pil_img, scale, all_bboxes, all_discarded
                if block[7] in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY, BlockType.INTERLINE_EQUATION]:
                    continue
                if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
-                    if span['height'] > median_span_height * 3 and span['height'] > span['width'] * 3:
+                    if span['height'] > median_span_height * 2.3 and span['height'] > span['width'] * 2.3:
                        vertical_spans.append(span)
                    elif block in all_bboxes:
                        useful_spans.append(span)