Files
MinerU/pdf2text_recogEquation.py
2024-02-29 16:53:41 +08:00

105 lines
5.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import collections # 统计库
import re # 正则
from libs.commons import fitz # pyMuPDF库
import json # json
from pathlib import Path
def parse_equations(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
    """
    Collect the inline and interline equation boxes detected on one PDF page.

    Detections are read from the layout-model result, rescaled from the
    model's image size to the size of the page rendered at 72 dpi, sorted
    top-to-bottom then left-to-right, and finally shifted by the top-left
    corner of the page crop box when that corner differs from the top-left
    corner of ``page.rect``.

    :param page_ID: int, index of the current page inside the PDF document.
        Kept for interface compatibility; it does not influence the result.
    :param page: the current page as read by fitz (PyMuPDF). Only
        ``get_pixmap``, ``cropbox`` and ``rect`` are used.
    :param json_from_DocXchain_obj: dict, the already-loaded layout-detection
        result for this page. ``page_info.width`` / ``page_info.height`` and
        the ``layout_dets`` list are read; each equation detection must carry
        ``category_id``, ``score`` and ``poly`` and may carry ``latex``.
    :return: ``(equationEmbedding_bboxs, equationIsolated_bboxs)``. Each is a
        list of ``(L, U, R, D, latex_text)`` tuples. Degenerate boxes
        (zero width or height) are not filtered out.
    """
    DPI = 72  # use this resolution (72 dpi: one pixel per PDF point)
    pix = page.get_pixmap(dpi=DPI)
    page_width = int(pix.w)
    page_height = int(pix.h)

    # --------- read the equation detections from the layout-model result ---------#
    xf_json = json_from_DocXchain_obj
    # Ratio between the coordinate system of the model output and the page.
    LR_scaleRatio = xf_json['page_info']['width'] / page_width
    UD_scaleRatio = xf_json['page_info']['height'] / page_height

    equationEmbedding_from_DocXChain_bboxs = []
    equationIsolated_from_DocXChain_bboxs = []
    # category_id -> (destination list, placeholder used when 'latex' is absent).
    # 13 is treated as an inline (embedded) equation, 14 as an interline
    # (isolated) equation; every other category is ignored here.
    destinations = {
        13: (equationEmbedding_from_DocXChain_bboxs, "EmptyInlineEquationResult"),
        14: (equationIsolated_from_DocXChain_bboxs, "EmptyInterlineEquationResult"),
    }

    for xf in xf_json['layout_dets']:
        destination = destinations.get(xf['category_id'])
        if destination is None:
            continue
        if not xf['score'] >= 0.3:  # drop low-confidence detections
            continue
        bboxs, missing_latex = destination

        # 'poly' is read as a flat coordinate list: indices 0/1 give the
        # left/top edge, index 2 the right edge and index 5 the bottom edge.
        poly = xf['poly']
        L = poly[0] / LR_scaleRatio
        U = poly[1] / UD_scaleRatio
        R = poly[2] / LR_scaleRatio
        D = poly[5] / UD_scaleRatio
        # Normalise so that L <= R and U <= D.
        L, R = min(L, R), max(L, R)
        U, D = min(U, D), max(U, D)

        bboxs.append((L, U, R, D, xf.get("latex", missing_latex)))

    # ---------------- sort: top edge first, then left edge ----------------#
    equationEmbedding_from_DocXChain_bboxs.sort(key=lambda LURD: (LURD[1], LURD[0]))
    equationIsolated_from_DocXChain_bboxs.sort(key=lambda LURD: (LURD[1], LURD[0]))

    # Adjust the bbox coordinates to the visible area of the PDF page.
    # The previous implementation only rebound its loop variable, so the
    # shift was silently discarded; the lists are rebuilt here instead.
    cropbox = page.cropbox
    dx, dy = cropbox[0], cropbox[1]
    if dx != page.rect[0] or dy != page.rect[1]:
        equationEmbedding_from_DocXChain_bboxs = [
            (L + dx, U + dy, R + dx, D + dy, latex_text)
            for L, U, R, D, latex_text in equationEmbedding_from_DocXChain_bboxs
        ]
        equationIsolated_from_DocXChain_bboxs = [
            (L + dx, U + dy, R + dx, D + dy, latex_text)
            for L, U, R, D, latex_text in equationIsolated_from_DocXChain_bboxs
        ]

    return equationEmbedding_from_DocXChain_bboxs, equationIsolated_from_DocXChain_bboxs