mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
refactor: enhance hybrid backend logic and improve span processing
This commit is contained in:
@@ -443,7 +443,7 @@ def doc_analyze(
|
||||
)
|
||||
|
||||
clean_memory(device)
|
||||
return middle_json, results
|
||||
return middle_json, results, _vlm_ocr_enable
|
||||
|
||||
|
||||
async def aio_doc_analyze(
|
||||
@@ -509,5 +509,5 @@ async def aio_doc_analyze(
|
||||
)
|
||||
|
||||
clean_memory(device)
|
||||
return middle_json, results
|
||||
return middle_json, results, _vlm_ocr_enable
|
||||
|
||||
|
||||
@@ -327,7 +327,6 @@ def _process_hybrid(
|
||||
**kwargs,
|
||||
):
|
||||
"""同步处理hybrid后端逻辑"""
|
||||
f_draw_span_bbox = False
|
||||
if not backend.endswith("client"):
|
||||
server_url = None
|
||||
|
||||
@@ -336,7 +335,7 @@ def _process_hybrid(
|
||||
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, f"hybrid_{parse_method}")
|
||||
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
|
||||
|
||||
middle_json, infer_result = hybrid_doc_analyze(
|
||||
middle_json, infer_result, _vlm_ocr_enable= hybrid_doc_analyze(
|
||||
pdf_bytes,
|
||||
image_writer=image_writer,
|
||||
backend=backend,
|
||||
@@ -349,6 +348,9 @@ def _process_hybrid(
|
||||
|
||||
pdf_info = middle_json["pdf_info"]
|
||||
|
||||
# f_draw_span_bbox = not _vlm_ocr_enable
|
||||
f_draw_span_bbox = False
|
||||
|
||||
_process_output(
|
||||
pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
|
||||
md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
|
||||
@@ -377,7 +379,7 @@ async def _async_process_hybrid(
|
||||
**kwargs,
|
||||
):
|
||||
"""异步处理hybrid后端逻辑"""
|
||||
f_draw_span_bbox = False
|
||||
|
||||
if not backend.endswith("client"):
|
||||
server_url = None
|
||||
|
||||
@@ -386,7 +388,7 @@ async def _async_process_hybrid(
|
||||
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, f"hybrid_{parse_method}")
|
||||
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
|
||||
|
||||
middle_json, infer_result = await aio_hybrid_doc_analyze(
|
||||
middle_json, infer_result, _vlm_ocr_enable = await aio_hybrid_doc_analyze(
|
||||
pdf_bytes,
|
||||
image_writer=image_writer,
|
||||
backend=backend,
|
||||
@@ -399,6 +401,9 @@ async def _async_process_hybrid(
|
||||
|
||||
pdf_info = middle_json["pdf_info"]
|
||||
|
||||
# f_draw_span_bbox = not _vlm_ocr_enable
|
||||
f_draw_span_bbox = False
|
||||
|
||||
_process_output(
|
||||
pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
|
||||
md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# Copyright (c) Opendatalab. All rights reserved.
|
||||
import collections
|
||||
import math
|
||||
import re
|
||||
import statistics
|
||||
|
||||
@@ -128,8 +129,9 @@ def txt_spans_extract(pdf_page, spans, pil_img, scale, all_bboxes, all_discarded
|
||||
page_all_lines = []
|
||||
for block in page_dict['blocks']:
|
||||
for line in block['lines']:
|
||||
if 0 < abs(line['rotation']) < 90:
|
||||
# 旋转角度在0-90度之间的行,直接跳过
|
||||
rotation_degrees = math.degrees(line['rotation'])
|
||||
# 旋转角度不为0, 90, 180, 270的行,直接跳过(rotation_degrees的值可能不为整数)
|
||||
if not any(abs(rotation_degrees - angle) < 0.1 for angle in [0, 90, 180, 270]):
|
||||
continue
|
||||
page_all_lines.append(line)
|
||||
for span in line['spans']:
|
||||
@@ -159,7 +161,7 @@ def txt_spans_extract(pdf_page, spans, pil_img, scale, all_bboxes, all_discarded
|
||||
if block[7] in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY, BlockType.INTERLINE_EQUATION]:
|
||||
continue
|
||||
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
|
||||
if span['height'] > median_span_height * 3 and span['height'] > span['width'] * 3:
|
||||
if span['height'] > median_span_height * 2.3 and span['height'] > span['width'] * 2.3:
|
||||
vertical_spans.append(span)
|
||||
elif block in all_bboxes:
|
||||
useful_spans.append(span)
|
||||
|
||||
Reference in New Issue
Block a user