refactor: enhance hybrid backend logic and improve span processing

This commit is contained in:
myhloli
2025-12-29 19:03:30 +08:00
parent 7750d864ed
commit 037e5f2460
3 changed files with 16 additions and 9 deletions

View File

@@ -443,7 +443,7 @@ def doc_analyze(
)
clean_memory(device)
return middle_json, results
return middle_json, results, _vlm_ocr_enable
async def aio_doc_analyze(
@@ -509,5 +509,5 @@ async def aio_doc_analyze(
)
clean_memory(device)
return middle_json, results
return middle_json, results, _vlm_ocr_enable

View File

@@ -327,7 +327,6 @@ def _process_hybrid(
**kwargs,
):
"""同步处理hybrid后端逻辑"""
f_draw_span_bbox = False
if not backend.endswith("client"):
server_url = None
@@ -336,7 +335,7 @@ def _process_hybrid(
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, f"hybrid_{parse_method}")
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
middle_json, infer_result = hybrid_doc_analyze(
middle_json, infer_result, _vlm_ocr_enable= hybrid_doc_analyze(
pdf_bytes,
image_writer=image_writer,
backend=backend,
@@ -349,6 +348,9 @@ def _process_hybrid(
pdf_info = middle_json["pdf_info"]
# f_draw_span_bbox = not _vlm_ocr_enable
f_draw_span_bbox = False
_process_output(
pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
@@ -377,7 +379,7 @@ async def _async_process_hybrid(
**kwargs,
):
"""异步处理hybrid后端逻辑"""
f_draw_span_bbox = False
if not backend.endswith("client"):
server_url = None
@@ -386,7 +388,7 @@ async def _async_process_hybrid(
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, f"hybrid_{parse_method}")
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
middle_json, infer_result = await aio_hybrid_doc_analyze(
middle_json, infer_result, _vlm_ocr_enable = await aio_hybrid_doc_analyze(
pdf_bytes,
image_writer=image_writer,
backend=backend,
@@ -399,6 +401,9 @@ async def _async_process_hybrid(
pdf_info = middle_json["pdf_info"]
# f_draw_span_bbox = not _vlm_ocr_enable
f_draw_span_bbox = False
_process_output(
pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,

View File

@@ -1,5 +1,6 @@
# Copyright (c) Opendatalab. All rights reserved.
import collections
import math
import re
import statistics
@@ -128,8 +129,9 @@ def txt_spans_extract(pdf_page, spans, pil_img, scale, all_bboxes, all_discarded
page_all_lines = []
for block in page_dict['blocks']:
for line in block['lines']:
if 0 < abs(line['rotation']) < 90:
# 旋转角度在0-90度之间的行直接跳过
rotation_degrees = math.degrees(line['rotation'])
# 旋转角度不为0, 90, 180, 270的行直接跳过；rotation_degrees的值可能不为整数，因此按0.1度的容差比较
if not any(abs(rotation_degrees - angle) < 0.1 for angle in [0, 90, 180, 270]):
continue
page_all_lines.append(line)
for span in line['spans']:
@@ -159,7 +161,7 @@ def txt_spans_extract(pdf_page, spans, pil_img, scale, all_bboxes, all_discarded
if block[7] in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY, BlockType.INTERLINE_EQUATION]:
continue
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
if span['height'] > median_span_height * 3 and span['height'] > span['width'] * 3:
if span['height'] > median_span_height * 2.3 and span['height'] > span['width'] * 2.3:
vertical_spans.append(span)
elif block in all_bboxes:
useful_spans.append(span)