mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
修复一个citation marker识别导致的行内部分span被删除问题,从而修复某些公式无法替换成功的错误
This commit is contained in:
@@ -271,9 +271,8 @@ def parse_pdf_by_model(
|
||||
""""以下进入到公式替换环节 """
|
||||
char_level_text_blocks = page.get_text("rawdict", flags=fitz.TEXTFLAGS_TEXT)['blocks']
|
||||
remain_text_blocks = combine_chars_to_pymudict(remain_text_blocks, char_level_text_blocks)# 合并chars
|
||||
remain_text_blocks = remove_citation_marker(remain_text_blocks) # 先把角标去掉
|
||||
|
||||
remain_text_blocks = replace_equations_in_textblock(remain_text_blocks, inline_eq_info, interline_eq_info)
|
||||
remain_text_blocks = remove_citation_marker(remain_text_blocks) # 公式替换之后去角标,防止公式无法替换成功。但是这样也会带来个问题就是把角标当公式。各有优劣。
|
||||
remain_text_blocks = remove_chars_in_text_blocks(remain_text_blocks) # 减少中间态数据体积
|
||||
#debug_show_bbox(pdf_docs, page_id, [b['bbox'] for b in inline_eq_info], [b['bbox'] for b in interline_eq_info], [], join_path(save_path, book_name, f"{book_name}_debug.pdf"), 3)
|
||||
|
||||
|
||||
@@ -114,12 +114,16 @@ def remove_citation_marker(with_char_text_blcoks):
|
||||
|
||||
# 找到高度最高的span作为位置比较的基准
|
||||
max_hi_span = line['spans'][0]['bbox']
|
||||
min_font_sz = 10000
|
||||
min_font_sz = 10000 # line里最小的字体
|
||||
max_font_sz = 0 # line里最大的字体
|
||||
|
||||
for s in line['spans']:
|
||||
if max_hi_span[3]-max_hi_span[1]<s['bbox'][3]-s['bbox'][1]:
|
||||
max_hi_span = s['bbox']
|
||||
if min_font_sz>s['size']:
|
||||
min_font_sz = s['size']
|
||||
if max_font_sz<s['size']:
|
||||
max_font_sz = s['size']
|
||||
|
||||
base_span_mid_y = (max_hi_span[3]+max_hi_span[1])/2
|
||||
|
||||
@@ -130,6 +134,9 @@ def remove_citation_marker(with_char_text_blcoks):
|
||||
span_mid_y = (span['bbox'][3]+span['bbox'][1])/2
|
||||
span_font_sz = span['size']
|
||||
|
||||
if max_font_sz-span_font_sz<1: # 先以字体过滤正文,如果是正文就不再继续判断了
|
||||
continue
|
||||
|
||||
if (base_span_mid_y-span_mid_y)/span_hi>0.2 or (base_span_mid_y-span_mid_y>0 and abs(span_font_sz-min_font_sz)/min_font_sz<0.1):
|
||||
"""
|
||||
1. 它的前一个char如果是句号或者逗号的话,那么肯定是角标而不是公式
|
||||
|
||||
@@ -1,12 +1,38 @@
|
||||
import os
|
||||
import collections # 统计库
|
||||
import re # 正则
|
||||
import re
|
||||
from libs.boxbase import _is_in # 正则
|
||||
from libs.commons import fitz # pyMuPDF库
|
||||
import json # json
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
|
||||
def __solve_contain_bboxs(all_bbox_list: list):
    """Remove equation bboxes that are nested inside another bbox.

    Every unordered pair of entries is compared on its first four values
    with ``_is_in``.  When the first box of the call is reported as being in
    the second, the entry owning that first box is scheduled for removal
    (presumably ``_is_in(a, b)`` means "a lies inside b" -- confirm against
    ``libs.boxbase``).  All list elements equal to a scheduled entry are then
    dropped.

    :param all_bbox_list: list of bbox entries; only ``entry[:4]`` takes part
        in the containment check.  The list is modified in place.
    :return: the very same list object, after the removals.
    """
    contained = []  # entries found to be inside some other entry

    for idx, first in enumerate(all_bbox_list):
        for second in all_bbox_list[idx + 1:]:
            first_box = first[:4]
            second_box = second[:4]

            # Keep the outer box of a nested pair, schedule the inner one.
            if _is_in(first_box, second_box):
                contained.append(first)
            elif _is_in(second_box, first_box):
                contained.append(second)

    # Filter in place so callers holding a reference see the pruned list.
    all_bbox_list[:] = [
        entry for entry in all_bbox_list if entry not in contained
    ]
    return all_bbox_list
|
||||
|
||||
|
||||
def parse_equations(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
|
||||
"""
|
||||
:param page_ID: int类型,当前page在当前pdf文档中是第page_ID页。
|
||||
@@ -101,4 +127,5 @@ def parse_equations(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict
|
||||
for eq_box in equationIsolated_from_DocXChain_bboxs:
|
||||
eq_box = [eq_box[0]+cropbox[0], eq_box[1]+cropbox[1], eq_box[2]+cropbox[0], eq_box[3]+cropbox[1], eq_box[4]]
|
||||
|
||||
return equationEmbedding_from_DocXChain_bboxs, equationIsolated_from_DocXChain_bboxs
|
||||
deduped_embedding_eq_bboxes = __solve_contain_bboxs(equationEmbedding_from_DocXChain_bboxs)
|
||||
return deduped_embedding_eq_bboxes, equationIsolated_from_DocXChain_bboxs
|
||||
|
||||
Reference in New Issue
Block a user