mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
Compare commits
4 Commits
magic_pdf-
...
magic_pdf-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7242a4a76e | ||
|
|
6cbf7fabcf | ||
|
|
5b9fa871bd | ||
|
|
bc339320ab |
@@ -59,6 +59,7 @@ from magic_pdf.post_proc.pdf_post_filter import pdf_post_filter
|
||||
from magic_pdf.pre_proc.remove_rotate_bbox import get_side_boundry, remove_rotate_side_textblock, remove_side_blank_block
|
||||
from magic_pdf.pre_proc.resolve_bbox_conflict import check_text_block_horizontal_overlap, resolve_bbox_overlap_conflict
|
||||
from magic_pdf.pre_proc.fix_table import fix_table_text_block, fix_tables, include_table_title
|
||||
from magic_pdf.pre_proc.solve_line_alien import solve_inline_too_large_interval
|
||||
|
||||
denseSingleLineBlockException_msg = DenseSingleLineBlockException().message
|
||||
titleDetectionException_msg = TitleDetectionException().message
|
||||
@@ -446,6 +447,10 @@ def parse_pdf_by_model(
|
||||
==================================================================================================================================
|
||||
进入段落处理-2阶段
|
||||
"""
|
||||
|
||||
# 处理行内文字间距较大问题
|
||||
pdf_info_dict = solve_inline_too_large_interval(pdf_info_dict)
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
para_process_pipeline = ParaProcessPipeline()
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from magic_pdf.libs.boxbase import _is_in # 正则
|
||||
from magic_pdf.libs.boxbase import _is_in, calculate_overlap_area_2_minbox_area_ratio # 正则
|
||||
from magic_pdf.libs.commons import fitz # pyMuPDF库
|
||||
|
||||
|
||||
@@ -18,7 +18,16 @@ def __solve_contain_bboxs(all_bbox_list: list):
|
||||
dump_list.append(all_bbox_list[i])
|
||||
elif _is_in(bbox2, bbox1):
|
||||
dump_list.append(all_bbox_list[j])
|
||||
|
||||
else:
|
||||
ratio = calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2)
|
||||
if ratio > 0.7:
|
||||
s1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
|
||||
s2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
|
||||
if s2 > s1:
|
||||
dump_list.append(all_bbox_list[i])
|
||||
else:
|
||||
dump_list.append(all_bbox_list[i])
|
||||
|
||||
# 遍历需要删除的列表中的每个元素
|
||||
for item in dump_list:
|
||||
|
||||
|
||||
29
magic_pdf/pre_proc/solve_line_alien.py
Normal file
29
magic_pdf/pre_proc/solve_line_alien.py
Normal file
@@ -0,0 +1,29 @@
|
||||
def solve_inline_too_large_interval(pdf_info_dict: dict) -> dict:  # text_block -> the 'preproc_blocks' entries in the JSON
    """Fix overly large gaps between text fragments that sit on the same row.

    For every page, every block in that page's ``'preproc_blocks'`` and every
    line in the block's ``'lines'``: when a line's bbox has the same
    integer-truncated top and bottom y-coordinates as the line immediately
    before it in the same block, a single space is prepended to the text of
    the line's first span.

    Args:
        pdf_info_dict: Mapping that is indexed with the keys ``'page_0'``,
            ``'page_1'``, ... up to ``len(pdf_info_dict) - 1``; each page
            must provide ``'preproc_blocks'``, a list of blocks carrying
            ``'lines'``, each line carrying a 4-element ``'bbox'`` and a
            ``'spans'`` list of dicts with a ``'text'`` entry.

    Returns:
        The same ``pdf_info_dict`` object, modified in place.

    Raises:
        KeyError: If an expected page key or a block/line field is missing.
    """
    for page_idx in range(len(pdf_info_dict)):
        text_blocks = pdf_info_dict[f'page_{page_idx}']['preproc_blocks']

        for block in text_blocks:
            # No previous line yet. A sentinel (instead of a zeroed bbox)
            # keeps a first line located at y == 0 from being mistaken for
            # a continuation of a non-existent predecessor.
            prev_rows = None

            for line in block['lines']:
                # bbox layout is [x1, y1, x2, y2]; only the vertical extent
                # is compared.
                _, y_top, _, y_bottom = line['bbox']
                rows = (int(y_top), int(y_bottom))

                spans = line['spans']
                # Same truncated vertical extent as the previous line: the
                # two are handled as pieces of one visual row. Lines without
                # spans have no text to pad, so they are left untouched.
                if rows == prev_rows and spans:
                    spans[0]['text'] = ' ' + spans[0]['text']

                prev_rows = rows

    return pdf_info_dict
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user