Files
MinerU/check_inline_formula.py
2024-02-29 16:53:41 +08:00

74 lines
4.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# 最终版把那种text_block有重叠且inline_formula位置在重叠部分的认定整个页面都有问题所有的inline_formula都改成no_check
from libs.commons import fitz
def check_inline_formula(page, inline_formula_boxes):
"""
:param page :fitz读取的当前页的内容
:param inline_formula_boxes: list类型每一个元素是一个元祖 (L, U, R, D)
:return: inline_formula_check: list类型每一个元素是一个类别其顺序对应输入的inline_formula_boxes给每个行内公式打一个标签包括
- nocheck_inline_formula这个公式框没有与任何span相交有可能存在问题
- wrong_text_block这个公式框同时存在多个block里可能页面的text block存在问题
- false_inline_formula只涉及一个span并且只占据这个span的小部分面积判断可能不是公式
- true_inline_formula两种情况判断为公式一是横跨多个span二是只涉及一个span但是几乎占据了这个span大部分的面积
"""
# count = defaultdict(int)
## ------------------------ Text --------------------------------------------
blocks = page.get_text(
"dict",
flags=fitz.TEXTFLAGS_TEXT,
#clip=clip,
)["blocks"]
# iterate over the bboxes
inline_formula_check = []
for result in inline_formula_boxes:
(x1, y1, x2, y2) = (result[0], result[1], result[2], result[3])
## 逐个block##
in_block = 0
for bbox in blocks:
# image = cv2.rectangle(image, (int(bbox['bbox'][0]), int(bbox['bbox'][1])), (int(bbox['bbox'][2]), int(bbox['bbox'][3])), (0, 255, 0), 1)
if (y1 >= bbox['bbox'][1] and y2 <= bbox['bbox'][3]) and (x1 >= bbox['bbox'][0] and x2 <= bbox['bbox'][2]): # 判定公式在哪一个block
in_block += 1
intersect = []
# ## 逐个span###
for line in bbox['lines']:
if line['bbox'][1] <= ((y2 - y1) / 2) + y1 <= line['bbox'][3]: # 判断公式在哪一行
for item in line['spans']:
(t_x1, t_y1, t_x2, t_y2) = item['bbox']
if not ((t_x1 < x1 and t_x2 < x1) or (t_x1 > x2 and t_x2 > x2) or (t_y1 < y1 and t_y2 < y1) or (t_y1 > y2 and t_y2 > y2)): # 判断是否相交
intersect.append(item['bbox'])
# image = cv2.rectangle(image, (int(t_x1), int(t_y1)), (int(t_x2), int(t_y2)), (0, 255, 0), 1) # 可视化涉及到的span
# 可视化公式的分类
if len(intersect) == 0: # 没有与任何一个span有相交这个span或者这个inline_formula_box可能有问题
# print(f'Wrong location, check {img_path}')
inline_formula_check_result = "nocheck_inline_formula"
# count['not_in_line'] += 1
elif len(intersect) == 1:
if abs((intersect[0][2] - intersect[0][0]) - (x2 - x1)) < (x2 - x1)*0.5: # 只涉及一个span但是几乎占据了这个span大部分的面积判定为公式
# image = cv2.rectangle(image, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 1)
inline_formula_check_result = "true_inline_formula"
# count['one_span_large'] += 1
else: # 只涉及一个span并且只占据这个span的小部分面积判断可能不是公式
# image = cv2.rectangle(image, (int(x1), int(y1)), (int(x2), int(y2)), (0, 0, 255), 1)
inline_formula_check_result = "false_inline_formula"
# count['fail'] += 1
else: # 横跨多个span,判定为公式
# image = cv2.rectangle(image, (int(x1), int(y1)), (int(x2), int(y2)), (255, 0, 0), 1)
inline_formula_check_result = "true_inline_formula"
# count['multi_span'] += 1
if in_block == 0: # 这个公式没有在任何的block里这个公式可能有问题
# image = cv2.rectangle(image, (int(x1), int(y1)), (int(x2), int(y2)), (255, 255, 0), 1)
inline_formula_check_result = "nocheck_inline_formula"
# count['not_in_block'] += 1
elif in_block > 1: # 这个公式存在于多个block里这个页面可能有问题
inline_formula_check_result = "wrong_text_block"
inline_formula_check.append(inline_formula_check_result)
return inline_formula_check