Files
MinerU/pdf2text_recogFootnoteLine.py
2024-02-29 16:53:41 +08:00

674 lines
31 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import io
import re
import os
import json
from libs.boxbase import _is_in_or_part_overlap, calculate_overlap_area_2_minbox_area_ratio
from libs.commons import fitz
from fitz import Point
from pprint import pprint
import pickle
import collections
from typing import List
def calculate_overlapRatio_between_rect1_and_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> (float, float):
    """
    Return the intersection area of two rectangles as a fraction of each rectangle's area.

    Rectangles are given as left / upper / right / lower edges.

    :return: (overlap / area of rect 1, overlap / area of rect 2).
             (0, 0) when the rectangles do not intersect or either one has zero area.
    """
    # Edges of the would-be intersection rectangle.
    inter_left, inter_right = max(L1, L2), min(R1, R2)
    inter_top, inter_bottom = max(U1, U2), min(D1, D2)
    if inter_right < inter_left or inter_bottom < inter_top:
        return 0, 0

    area_first = (R1 - L1) * (D1 - U1)
    area_second = (R2 - L2) * (D2 - U2)
    # A degenerate rectangle would make the ratio a division by zero.
    if area_first == 0 or area_second == 0:
        return 0, 0

    area_shared = (inter_right - inter_left) * (inter_bottom - inter_top)
    return area_shared / area_first, area_shared / area_second
def calculate_overlapRatio_between_line1_and_line2(L1: float, R1: float, L2: float, R2: float) -> (float, float):
    """
    Return the shared length of two 1-D intervals as a fraction of each interval's length.

    :return: (overlap / length of interval 1, overlap / length of interval 2).
             (0, 0) when the intervals are disjoint or either one has zero length.
    """
    shared_start = max(L1, L2)
    shared_end = min(R1, R2)
    is_disjoint = shared_start > shared_end
    # A zero-length interval would make the ratio a division by zero.
    is_degenerate = L1 == R1 or L2 == R2
    if is_disjoint or is_degenerate:
        return 0, 0

    shared_len = shared_end - shared_start
    return shared_len / (R1 - L1), shared_len / (R2 - L2)
def parse_footnoteLine(page_ID: int, page: fitz.Page, json_from_DocXchain_obj, exclude_bboxes):
    """
    Detect footnote separator lines on one page and return the bboxes of the footnote area.

    Horizontal drawings are collected from the page and filtered by a chain of heuristics
    (position on the page, relation to other lines, text directly below the line, alignment
    with the left edge of a text column). For every surviving line the text blocks beneath
    it are gathered.

    :param page_ID: int, index of the current page within the PDF. Not referenced by the active code below.
    :param page: the current page as read by fitz.
    :param json_from_DocXchain_obj: dict, per the original note the DocXChain bbox result loaded from the
        page's JSON file. Only referenced by commented-out code below.
    :param exclude_bboxes: iterable of (L, U, R, D) boxes; a candidate line lying inside one of them is
        dropped. The original comments describe them as figure / table / equation boxes.
    :return: list of (L, U, R, D) tuples: each accepted footnote line, the text-block bboxes collected
        under it, and the union of those bboxes. The list is de-duplicated through a set, so its order
        is not guaranteed.
    """
    DPI = 72  # use this resolution
    pix = page.get_pixmap(dpi=DPI)
    # Page extent, taken from the rendered pixmap size.
    pageL = 0
    pageR = int(pix.w)
    pageU = 0
    pageD = int(pix.h)
    #---------------------- Parse text with PyMuPDF --------------------#
    textSize_freq = collections.defaultdict(float)  # character count per font size over all text spans
    textBlock_bboxs = []
    textLine_bboxs = []
    text_blocks = page.get_text(
        "dict",
        flags=fitz.TEXTFLAGS_TEXT,
        #clip=clip,
    )["blocks"]
    totText_list = []
    for i in range(len(text_blocks)):
        # print(blocks[i]) #### print
        bbox = text_blocks[i]['bbox']
        textBlock_bboxs.append(bbox)
        # print(bbox)
        cur_block_text_list = []
        for tt in text_blocks[i]['lines']:
            # current line
            cur_line_text_list = []
            cur_line_bbox = None  # bbox of the right-most section of the current line (never assigned below)
            for xf in tt['spans']:
                L, U, R, D = xf['bbox']
                # Normalise so that L <= R and U <= D.
                L, R = min(L, R), max(L, R)
                U, D = min(U, D), max(U, D)
                textLine_bboxs.append((L, U, R, D))
                cur_line_text_list.append(xf['text'])
                textSize_freq[xf['size']] += len(xf['text'])
            cur_lines_text = ' '.join(cur_line_text_list)
            cur_block_text_list.append(cur_lines_text)
        totText_list.append('\n'.join(cur_block_text_list))
    totText = '\n'.join(totText_list)  # full page text; only useful for the debug print below
    # print(totText) # print the text
    textLine_bboxs.sort(key = lambda LURD: (LURD[0], LURD[1]))
    textBlock_bboxs.sort(key = lambda LURD: (LURD[0], LURD[1]))
    # print('------------ textSize_freq -----------')
    max_sizeFreq = 0  # highest character count seen for a single font size
    textSize_withMaxFreq = 0  # the font size with that count; used as the unit of length in the heuristics below
    for x, f in textSize_freq.items():
        # print(x, f)
        if f > max_sizeFreq:
            max_sizeFreq = f
            textSize_withMaxFreq = x
    #**********************************************************#
    #------------------ Read drawings with PyMuPDF -----------------#
    horizon_lines = []
    drawings = page.get_cdrawings()
    for drawing in drawings:
        try:
            rect = drawing['rect']
            L, U, R, D = rect
            # if (L, U, R, D) in exclude_bboxes:
            # continue # Figure, Table or Equation. Commented out: lines may first cancel each other out; exclusion is decided at the end.
            # A horizontal line: at most 3 units tall.
            if U <= D and D - U <= 3:
                # Long enough: at least 1/15 of the page width.
                if (pageR - pageL) / 15 <= R - L:
                    if not(80/800 * pageD <= U <= 750/800 * pageD):
                        continue  # most likely a header or footer rule
                    horizon_lines.append((L, U, R, D))
                    # print((L, U, R, D))
        except:
            # Drawings that cannot be read as a 4-value rect are silently skipped.
            pass
    horizon_lines.sort(key = lambda LURD: (LURD[1]))  # top to bottom
    #********************************************************#
    #----------------- Two lines may belong to a table ------------------#
    def has_text_below_line(L: float, U: float, R: float, D: float, inLowerArea: bool) -> bool:
        """
        Check whether text sits directly below the line and the line looks like a footnote rule.

        Reads text_blocks, textSize_withMaxFreq, pageR and pageD from the enclosing scope, and
        appends to the enclosing textBlock_bboxs on every call.

        :param inLowerArea: when False, text lying to the left of the line is also taken into
            account and at least 10 letters/digits are required below the line. Every call in
            this function passes True.
        """
        Uu, Du = U - textSize_withMaxFreq, U  # vertical band one font size high, above the line
        Lu, Ru = L, R
        Ud, Dd = U, U + textSize_withMaxFreq  # vertical band one font size high, below the line
        Ld, Rd = L, R
        find = 0  # text below the line, accumulated as area
        leftTextCnt = 0  # text not under the line (lying to its left), hinting it is not a footnote line; accumulated as area
        English_alpha_cnt = 0  # characters for which str.isalpha() is True
        nonEnglish_alpha_cnt = 0  # characters that fall through every other category
        punctuation_mark_cnt = 0  # common punctuation marks
        digit_cnt = 0  # digits
        distance_nearest_up_line = None
        distance_nearest_down_line = None
        for i in range(len(text_blocks)):
            # print(blocks[i]) #### print
            bbox = text_blocks[i]['bbox']
            L0, U0, R0, D0 = bbox
            if 0< (R0 - L0) < pageR / 6 and (D0 - U0) / (R0 - L0) > 10 :
                continue  # a very narrow vertical strip, e.g. the arXiv stamp on the left of a preprint
            textBlock_bboxs.append(bbox)
            # print(bbox)
            cur_block_text_list = []
            for tt in text_blocks[i]['lines']:
                # current line
                cur_line_text_list = []
                cur_line_bbox = None  # bbox of the right-most section of the current line (never assigned below)
                for xf in tt['spans']:
                    L2, U2, R2, D2 = xf['bbox']
                    L2, R2 = min(L2, R2), max(L2, R2)
                    U2, D2 = min(U2, D2), max(U2, D2)
                    textLine = xf['text']
                    # Span starts more than 20% of L to the left of the line's left end.
                    if L>0 and L2 < L and (L - L2) / L > 0.2:
                        leftTextCnt += abs(R2 - L2) * abs(D2 - U2)
                    else:
                        ## part below the line
                        ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(Ud, Dd, U2, D2)
                        ratio_3, ratio_4 = calculate_overlapRatio_between_line1_and_line2(Ld, Rd, L2, R2)
                        if U < (U2 + D2) / 2 and ratio_1 > 0 and ratio_2 > 0:
                            if max(ratio_3, ratio_4) > 0.8:
                                # if 444 <= U1 < 445 and 55 <= L2 < 56:
                                # print('matched box', L2, U2, R2, D2)
                                # if xf['size'] > 1.2 * textSize_withMaxFreq:
                                # return False # might be a heading. Cannot filter this way.
                                find += abs(R2 - L2) * abs(D2 - U2)
                                distance_nearest_down_line = (U2 + D2) / 2 - U
                                # Classify the characters of the span below the line.
                                for c in textLine:
                                    if c == ' ':
                                        continue
                                    elif c.isdigit() == True:
                                        digit_cnt += 1
                                    elif c in ',.:!?[]()%,。、!?:【】()《》-':
                                        punctuation_mark_cnt += 1
                                    elif c.isalpha() == True:
                                        English_alpha_cnt += 1
                                    else:
                                        nonEnglish_alpha_cnt += 1
                        ## part above the line
                        ratio_5, ratio_6 = calculate_overlapRatio_between_line1_and_line2(Uu, Du, U2, D2)
                        ratio_7, ratio_8 = calculate_overlapRatio_between_line1_and_line2(Lu, Ru, L2, R2)
                        if (U2 + D2) / 2 < U and ratio_5 > 0 and ratio_6 > 0:
                            if max(ratio_7, ratio_8) > 0.8:
                                distance_nearest_up_line = U - (U2 + D2) / 2
                                # if distance_nearest_up_line < 0:
                                # print(Lu, Uu, Ru, Du, L2, U2, R2, D2)
        # print(distance_nearest_up_line, distance_nearest_down_line)
        if distance_nearest_up_line != None and distance_nearest_down_line != None:
            if distance_nearest_up_line * 1.5 < distance_nearest_down_line:
                return False  # the line is closer to the text above it: an underline, not a footnote line
        ## For lines higher up, text to the left is considered; for lines very low on the page it is not (for now).
        if inLowerArea == False:
            if leftTextCnt >= 2000/500000 * pageR * pageD:
                return False
            # NOTE(review): find only accumulates non-negative areas, so `find >= 0` is always true.
            return find >= 0 and (English_alpha_cnt + nonEnglish_alpha_cnt + digit_cnt) >= 10
        ## Decision for lines in the lowest area.
        # print(English_alpha_cnt, nonEnglish_alpha_cnt, digit_cnt)
        if (English_alpha_cnt + nonEnglish_alpha_cnt + digit_cnt) == 0:
            return False
        if (English_alpha_cnt + digit_cnt) / (English_alpha_cnt + nonEnglish_alpha_cnt + digit_cnt) > 0.5:
            if nonEnglish_alpha_cnt / (English_alpha_cnt + nonEnglish_alpha_cnt + digit_cnt) > 0.4:
                return False
            else:
                return True
        return True
    # Pairwise elimination: visited[k] == True marks horizon_lines[k] for removal.
    visited = [False for _ in range(len(horizon_lines))]
    for i, b1 in enumerate(horizon_lines):
        for j in range(i + 1, len(horizon_lines)):
            L1, U1, R1, D1 = horizon_lines[i]
            L2, U2, R2, D2 = horizon_lines[j]
            ## On the same horizontal level and next to each other
            if L1 > L2:
                # Swap so that line 1 is the left one.
                L1, U1, R1, D1, L2, U2, R2, D2 = L2, U2, R2, D2, L1, U1, R1, D1
            in_horizontal_line_flag = (max(U1, D1, U2, D2) - min(U1, D1, U2, D2) <= 5) and (L2 - R1 <= pageR/10)
            if in_horizontal_line_flag == True:
                visited[i] = True
                visited[j] = True
            ## Vertically aligned (a table, or a document that simply likes drawing rules)
            L1, U1, R1, D1 = horizon_lines[i]
            L2, U2, R2, D2 = horizon_lines[j]
            ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
            # print(L1, U1, R1, D1, L2, U2, R2, D2, ratio_1, ratio_2)
            in_vertical_line_flag = (ratio_1 > 0.9 and ratio_2 > 0.9) or (max(ratio_1, ratio_2) > 0.95)
            if in_vertical_line_flag == True:
                visited[i] = True
                # if (U2 < pageD * 0.8 or (U2 - U1) < pageD * 0.3) and has_text_below_line(L2, U2, R2, D2, False) == False:
                # visited[j] = True # leave the very lowest line alone for now
            else:
                # The lower line overlaps the upper one horizontally and is longer: drop the upper one.
                if ratio_1 > 0 and (R2 - L2) / (R1 - L1) > 1:
                    visited[i] = True
    # print(horizon_lines)
    horizon_lines = [horizon_lines[i] for i in range(len(horizon_lines)) if visited[i] == False]
    # print(horizon_lines)
    #*****************************************************************#
    #------- Lines high on the page are not footnotes. A THRESHOLD drops everything in the upper half. -------#
    visited = [False for _ in range(len(horizon_lines))]
    THRESHOLD = (pageD - pageU) * 0.5
    for i, (L, U, R, D) in enumerate(horizon_lines):
        if U < THRESHOLD:
            visited[i] = True
    horizon_lines = [horizon_lines[i] for i in range(len(horizon_lines)) if visited[i] == False]
    #******************************************************#
    #--------------- Lines still overlapping horizontally: discard the upper one ---------------#
    visited = [False for _ in range(len(horizon_lines))]
    for i, (L1, U1, R1, D1) in enumerate(horizon_lines):
        for j in range(i + 1, len(horizon_lines)):
            L2, U2, R2, D2 = horizon_lines[j]
            ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
            if (ratio_1 > 0.2 and ratio_2 > 0.2) or max(ratio_1, ratio_2) > 0.7:
                visited[i] = True
    horizon_lines = [horizon_lines[i] for i in range(len(horizon_lines)) if visited[i] == False]
    #********************************************************#
    # print(horizon_lines)
    ## Check whether text sits directly below the line
    horizon_lines = [LURD for LURD in horizon_lines if has_text_below_line(*(LURD), True) == True]
    # print(horizon_lines)
    ## Length filter (disabled)
    # horizon_lines = [LURD for LURD in horizon_lines if (LURD[2] - LURD[0] >= pageR / 10)]
    ## Keep at most 2 lines from this stage (the last two in list order)
    horizon_lines = horizon_lines[max(-2, -len(horizon_lines)) :]
    #----------------------------------------------------- Stage 2 -----------------------------------------------------------#
    #----------------------------------- Lines at the very bottom, gated by a hard position threshold. Lines on the right side are covered as well. -----------------------------------#
    #------------------ Read drawings with PyMuPDF -----------------#
    down_horizon_lines = []
    drawings = page.get_cdrawings()
    for drawing in drawings:
        try:
            rect = drawing['rect']
            L, U, R, D = rect
            # if (L, U, R, D) in exclude_bboxes:
            # continue # Figure, Table or Equation. Currently figures are recognised well, tables and equations are not.
            # A horizontal line located in the lowest 15% of the page.
            if U <= D and D - U <= 3 and U > pageD * 0.85:
                # Long enough: at least 1/15 of the page width.
                if (pageR - pageL) / 15 <= R - L:
                    down_horizon_lines.append((L, U, R, D))
                    # print((L, U, R, D))
        except:
            # Drawings that cannot be read as a 4-value rect are silently skipped.
            pass
    down_horizon_lines.sort(key = lambda LURD: (LURD[0], LURD[2], LURD[1]))
    # Adjacent lines (in sorted order) that barely overlap horizontally and are at most a third
    # of the page width apart are both discarded.
    visited = [False for _ in range(len(down_horizon_lines))]
    for i in range(len(down_horizon_lines) - 1):
        L1, U1, R1, D1 = down_horizon_lines[i]
        L2, U2, R2, D2 = down_horizon_lines[i + 1]
        ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
        if ratio_1 <= 0.1 and ratio_2 <= 0.1:
            if L2 - R1 <= pageR / 3:
                visited[i] = True
                visited[i + 1] = True
    down_horizon_lines = [down_horizon_lines[i] for i in range(len(down_horizon_lines)) if visited[i] == False]
    down_horizon_lines = [LURD for LURD in down_horizon_lines if has_text_below_line(*(LURD), True) == True]
    # for LURD in down_horizon_lines:
    # print('stage 2 LURD is ', LURD)
    # print(has_text_below_line(*(LURD), True))
    # Merge both stages, de-duplicate, and keep at most two lines.
    # The set round-trip does not preserve order, so which two are kept is not tied to page position.
    footnoteLines = horizon_lines + down_horizon_lines
    footnoteLines = list(set(footnoteLines))
    footnoteLines = footnoteLines[max(-2, -len(footnoteLines)) : ]
    #-------------------------- Final check: is the line inside a figure, table or equation? ------------------------------#
    def line_in_specialBboxes(L: float, U: float, R: float, D: float, specialBboxes) -> bool:
        """
        Return True when the line lies vertically inside one of specialBboxes and more than
        60% of the line's horizontal extent is covered by that box.
        """
        L2, U2, R2, D2 = L, U, R, D  # the line being tested
        for L1, U1, R1, D1 in specialBboxes:
            if U1 <= U2 <= D2 < D1:
                ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
                if ratio_1 > 0 and ratio_2 > 0.6:
                    return True
            # else:
            # U1 -= min(textSize_withMaxFreq * 2, 20)
            # D1 += min(textSize_withMaxFreq * 2, 20)
            # if U1 <= U2 <= D2 < D1:
            # ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
            # if ratio_1 > 0 and ratio_2 > 0.8:
            # return True
        return False
    footnoteLines = [LURD for LURD in footnoteLines if line_in_specialBboxes(*(LURD), exclude_bboxes) == False]
    #-------------------------- Check that the line sits at the left of its column rather than in the middle of a paragraph, via columns from DocXChain or the Layout recognition written by Xu Chao ------------------------------#
    # #--------- Get the columns from json_from_DocXchain ---------#
    # column_bbox_from_DocXChain = []
    # xf_json = json_from_DocXchain_obj
    # width_from_json = xf_json['page_info']['width']
    # height_from_json = xf_json['page_info']['height']
    # LR_scaleRatio = width_from_json / (pageR - pageL)
    # UD_scaleRatio = height_from_json / (pageD - pageU)
    # # {0: 'title',
    # # 1: 'figure',
    # # 2: 'plain text',
    # # 3: 'header',
    # # 4: 'page number',
    # # 5: 'footnote',
    # # 6: 'footer',
    # # 7: 'table',
    # # 8: 'table caption',
    # # 9: 'figure caption',
    # # 10: 'equation',
    # # 11: 'full column',   # single column
    # # 12: 'sub column',    # multi column
    # # 13: 'embedding',     # inline equation
    # # 14: 'isolated'}      # display equation
    # for xf in xf_json['layout_dets']:
    # L = xf['poly'][0] / LR_scaleRatio
    # U = xf['poly'][1] / UD_scaleRatio
    # R = xf['poly'][2] / LR_scaleRatio
    # D = xf['poly'][5] / UD_scaleRatio
    # # L += pageL # on some pages the artBox is offset, not at 0,0
    # # R += pageL
    # # U += pageU
    # # D += pageU
    # L, R = min(L, R), max(L, R)
    # U, D = min(U, D), max(U, D)
    # if (xf['category_id'] == 11 or xf['category_id'] == 12) and xf['score'] >= 0.3:
    # column_bbox_from_DocXChain.append((L, U, R, D))
    #--------------- Hand-written check: is the line aligned with the left edge of some column? ------------------#
    def check_isOnTheLeftOfColumn(L: float, U: float, R: float, D: float) -> bool:
        """
        Return True when at least 4 text spans near the line start close to the line's left end.

        Spans are taken from textLine_bboxs within a vertical window reaching 100/800 of the
        page height above the line and 50/800 below it (clamped to 2%..98% of the page height).
        """
        LL = L - textSize_withMaxFreq  # reference x: one font size left of the line's left end
        RR = LL  # only referenced by the commented-out debug prints below
        UU = max(pageD * 0.02, U - 100/800 * pageD)
        DD = min(U + 50/800 * pageD, pageD * 0.98)
        # print(LL, UU, RR, DD)
        cnt = 0
        for bbox in textLine_bboxs:
            L2, U2, R2, D2 = bbox
            ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(UU, DD, U2, D2)
            ratio_3, ratio_4 = calculate_overlapRatio_between_line1_and_line2(L, R, L2, R2)
            if ratio_1 > 0 and ratio_2 > 0:
                if max(ratio_3, ratio_4) > 0.8:
                    if abs(LL - L2) <= 20/700 * pageR:
                        cnt += 1
                    # else:
                    # if (R2 - L2) >= 30/700 * pageR:
                    # print(LL, UU, RR, DD, L2, U2, R2, D2)
                    # return False # cannot filter this way: a lone special symbol in a note can be its own textLineBbox
        # print('cnt: ', cnt)
        return cnt >= 4
    # def check_isOnTheLeftOfColumn_considerLayout(L0: float, U0: float, R0: float, D0: float) -> bool:
    # LL = L0 - textSize_withMaxFreq * 1.5
    # RR = LL
    # UU = 100/800 * pageD
    # DD = 700/800 * pageD
    # STEP = textSize_withMaxFreq / 2
    # def check_ok(L: float, U: float, R: float, D: float) -> bool:
    # for bbox in textBlock_bboxs:
    # L2, U2, R2, D2 = bbox
    # ratio_3, ratio_4 = calculate_overlapRatio_between_line1_and_line2(L, R, L2, R2)
    # if max(ratio_3, ratio_4) > 0.8:
    # if (R2 - L2) > 1/4 * pageR and L2 < LL <= RR < R2:
    # if abs(LL - L2) < 50/700 * pageR or abs(RR - R2) < 50/700 * pageR:
    # continue
    # else:
    # return False
    # return True
    # ## probe upwards first
    # u = UU
    # d = U0
    # while u + STEP/2 < d:
    # mid = (u + d) / 2
    # if check_ok(L0, mid, R0, U0) == True:
    # d = mid
    # else:
    # u = mid + STEP
    # print(mid)
    # dist_up = U0 - u
    # print(u)
    # ## then probe downwards
    # u = D0
    # d = DD
    # while u + STEP/2 < d:
    # mid = (u + d) / 2
    # if check_ok(L0, mid, R0, D0) == True:
    # u = mid
    # else:
    # d = mid - STEP
    # print(u)
    # print('^^^^^^^^^^^^^^')
    # dist_down = u - D0
    # if dist_up + dist_down < textSize_withMaxFreq * 10:
    # return False
    # return True
    footnoteLines = [LURD for LURD in footnoteLines if check_isOnTheLeftOfColumn(*(LURD)) == True]
    # footnoteLines = [LURD for LURD in footnoteLines if check_isOnTheLeftOfColumn_considerLayout(*(LURD)) == True] # does not generalise; no longer used.
    #--------------------------------- Derive the bboxes from each footnote line -------------------------------#
    def get_footnoteBbox(L: float, U: float, R: float, D: float) -> (float, float, float, float):
        """
        Collect the text-block bboxes that form the footnote under the given line.

        Returns a list: the accepted text-block bboxes followed by their union bbox, or an
        empty list when any plausibility check fails. (The return annotation names a 4-tuple,
        but every return statement yields a list.)
        """
        L1, U1, R1, D1 = L, U, R, D
        raw_bboxes = []
        for i in range(len(text_blocks)):
            bbox = text_blocks[i]['bbox']
            L2, U2, R2, D2 = bbox
            if (D2 - U2) / (R2 - L2) > 10 and (R2 - L2) < pageR / 6:
                continue  # a very narrow vertical strip, e.g. the arXiv stamp on the left of a preprint
            if U2 < D2 < U1:
                continue  # entirely above the line
            # Only blocks starting within 20 font sizes below the line (capped at 98% of the page height).
            under_THRESHOLD = min(D1 + textSize_withMaxFreq * 20, pageD * 0.98)
            if U2 < under_THRESHOLD:
                ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
                if max(ratio_1, ratio_2) > 0.8:
                    raw_bboxes.append((L2, U2, R2, D2))
        # print(L1, U1, R1, D1)
        # print(raw_bboxes)
        if len(raw_bboxes) == 0:
            return []
        raw_bboxes.sort(key = lambda LURD: (LURD[1], LURD[0]))
        raw_bboxes = [LURD for LURD in raw_bboxes if (abs(LURD[0] - L1) < textSize_withMaxFreq * 6 or L1 < LURD[0])]  # footnote bboxes should all be left-aligned
        if len(raw_bboxes) == 0:
            return []
        #------------------ A mix of full-column and sub-column blocks is rejected ------------------#
        # (LL, UU, RR, DD) becomes the union of all candidate bboxes.
        LL, UU, RR, DD = raw_bboxes[0]
        for L, U, R, D in raw_bboxes:
            LL, UU, RR, DD = min(LL, L), min(UU, U), max(RR, R), max(DD, D)
        for L, U, R, D in raw_bboxes:
            if (RR - LL) > pageR*0.8 and (R - L) > pageR * 0.15 and (RR - LL) / (R - L) > 2:
                return []
            if abs(LL - L) > textSize_withMaxFreq * 3:
                return []
        #-------------------- A full-column box that reaches too high is rejected ----------------------#
        if UU < 650/800 * pageD and (RR - LL) > 0.5 * pageR:
            return []
        #-------------- A tiny first block followed by much larger blocks is rejected ----------------#
        if len(raw_bboxes) > 1:
            bbox_square = []
            for L, U, R, D in raw_bboxes:
                cur_s = abs(R - L) * abs(D - U)
                bbox_square.append(cur_s)
            s0 = bbox_square[0]  # area of the top-most block
            s1n = sum(bbox_square[1: ]) / len(bbox_square[1: ])  # mean area of the remaining blocks
            if s1n / s0 > 10 or max(bbox_square) / s0 > 15:
                return []
        raw_bboxes += [(LL, UU, RR, DD)]
        return raw_bboxes
    # print(footnoteLines)
    footnoteBboxes = []
    for L, U, R, D in footnoteLines:
        cur = get_footnoteBbox(L, U, R, D)
        if len(cur) > 0:
            # Keep the line itself together with the bboxes found under it.
            footnoteBboxes.append((L, U, R, D))
            footnoteBboxes += cur
    footnoteBboxes = list(set(footnoteBboxes))
    return footnoteBboxes
def __bbox_in(box1, box2):
    """
    Tell whether box1 lies inside box2.

    Both boxes are (left, upper, right, lower); every coordinate is truncated with int()
    before comparison, and touching edges count as inside.
    """
    inner_left, inner_top, inner_right, inner_bottom = box1
    outer_left, outer_top, outer_right, outer_bottom = box2
    # Reject as soon as one edge of box1 sticks out of box2.
    if int(outer_left) > int(inner_left):
        return False
    if int(outer_top) > int(inner_top):
        return False
    if int(inner_right) > int(outer_right):
        return False
    return int(inner_bottom) <= int(outer_bottom)
def remove_footnote_text(raw_text_block, footnote_bboxes):
    """
    Move text blocks that touch a footnote bbox out of the page's block list.

    :param raw_text_block: list of text-block dicts for the current page (each with a 'bbox' key);
        modified in place.
    :param footnote_bboxes: list of footnote bboxes for the current page.
    :return: (raw_text_block with the footnote blocks removed, list of removed blocks).
        Every removed block gets block['tag'] = 'footnote'.
    """
    footnote_text_blocks = []
    for candidate in raw_text_block:
        candidate_bbox = candidate['bbox']
        # TODO: be stricter and do this at line level
        overlap_flags = [_is_in_or_part_overlap(candidate_bbox, fn_bbox) for fn_bbox in footnote_bboxes]
        if not any(overlap_flags):
            continue
        candidate['tag'] = 'footnote'
        footnote_text_blocks.append(candidate)

    # Removal is deferred to a second pass: deleting while iterating would skip elements.
    for tagged in footnote_text_blocks:
        raw_text_block.remove(tagged)
    return raw_text_block, footnote_text_blocks
def remove_footnote_image(image_blocks, footnote_bboxes):
    """
    Move image blocks that lie inside a footnote bbox out of the page's image list.

    :param image_blocks: list of image-block dicts for the current page (each with a 'bbox' key);
        modified in place.
    :param footnote_bboxes: list of footnote bboxes for the current page.
    :return: (image_blocks with the footnote images removed, list of removed image blocks).
    """
    footnote_imgs_blocks = []
    for candidate in image_blocks:
        contained_flags = [__bbox_in(candidate['bbox'], fn_bbox) for fn_bbox in footnote_bboxes]
        if any(contained_flags):
            footnote_imgs_blocks.append(candidate)

    # Second pass so the list is not mutated while it is being iterated.
    for contained in footnote_imgs_blocks:
        image_blocks.remove(contained)
    return image_blocks, footnote_imgs_blocks
def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs, page_no_bboxs, page_w, page_h):
    """
    Remove header, footer and page-number content from one page.

    Text is removed at span level; a line left without spans is dropped from its block, and
    a block left without lines is tagged 'in-foot-header-area' and moved to the removed list.

    :param text_raw_blocks: list of text-block dicts ('bbox', 'lines' -> 'spans' -> 'bbox'/'text').
        Modified in place (spans/lines/blocks removed, tags added, possibly re-sorted).
    :param image_bboxes: list of image bboxes.
    :param table_bboxes: list of table bboxes.
    :param header_bboxs: list of (x0, y0, x1, y1) header boxes, may be empty.
    :param footer_bboxs: list of (x0, y0, x1, y1) footer boxes, may be empty.
    :param page_no_bboxs: list of page-number boxes, may be empty. When empty, the block with the
        largest top coordinate is tested with a "digits but no letters" heuristic instead.
    :param page_w: page width.
    :param page_h: page height.
    :return: (image_bbox_remain, table_bbox_remain, text_block_remain,
              text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove)
    """
    # Header bottom / footer top taken from the union of the model boxes.
    # The tuple unpacking also enforces that every box has exactly four values.
    header_y0 = max(y1 for _, _, _, y1 in header_bboxs) if header_bboxs else 0
    footer_y0 = min(y0 for _, y0, _, _ in footer_bboxs) if footer_bboxs else page_h

    if page_no_bboxs:
        # Page numbers can push the header boundary down / the footer boundary up.
        top_part = [b for b in page_no_bboxs if b[3] < page_h/2]
        btn_part = [b for b in page_no_bboxs if b[1] > page_h/2]
        top_max_y0 = max([b[1] for b in top_part]) if top_part else 0
        btn_min_y1 = min([b[3] for b in btn_part]) if btn_part else page_h
        header_y0 = max(header_y0, top_max_y0)
        footer_y0 = min(footer_y0, btn_min_y1)

    content_boundry = [0, header_y0, page_w, footer_y0]
    header = [0, 0, page_w, header_y0]
    footer = [0, footer_y0, page_w, page_h]

    def _in_header_or_footer(span):
        # Same test order as before: the cheap "above the header bottom" comparison first.
        return (
            span['bbox'][3] < header_y0
            or _is_in_or_part_overlap(span['bbox'], header)
            or _is_in_or_part_overlap(span['bbox'], footer)
        )

    # The boundaries are known; now delete what falls into the header/footer areas.
    text_block_to_remove = []
    for blk in text_raw_blocks:
        for line in blk['lines']:
            # Decide for EVERY span of the line. The previous code reset its delete-list on each
            # iteration, so only the last span of a line (and the last line of a block) could
            # ever be removed. Slice assignment keeps the original list objects.
            line['spans'][:] = [span for span in line['spans'] if not _in_header_or_footer(span)]
        blk['lines'][:] = [line for line in blk['lines'] if line['spans']]
        if not blk['lines']:
            # Covers blocks that were empty from the start and blocks emptied just now.
            blk['tag'] = 'in-foot-header-area'
            text_block_to_remove.append(blk)

    # A small page number often overlaps content_boundry slightly and slips into the body
    # text, so page numbers are additionally handled at span granularity.
    page_no_block_2_remove = []
    if page_no_bboxs:
        for pagenobox in page_no_bboxs:
            for block in text_raw_blocks:
                if not _is_in_or_part_overlap(pagenobox, block['bbox']):
                    continue
                for line in block['lines']:
                    for span in line['spans']:
                        if _is_in_or_part_overlap(pagenobox, span['bbox']):
                            #span['text'] = ''
                            span['tag'] = "page-no"
                            # If this span is the block's only content, remove the whole block.
                            if len(line['spans']) == 1 and len(block['lines'])==1:
                                page_no_block_2_remove.append(block)
    elif len(text_raw_blocks) > 0:
        # No page-number box available: test whether the lowest block is a page number, i.e. it
        # has exactly one line with one span whose text contains a digit and no ASCII letter.
        text_raw_blocks.sort(key=lambda x: x['bbox'][1], reverse=True)
        last_block = text_raw_blocks[0]
        if len(last_block['lines']) == 1:
            last_line = last_block['lines'][0]
            if len(last_line['spans']) == 1:
                last_span = last_line['spans'][0]
                if last_span['text'].strip() and not re.search('[a-zA-Z]', last_span['text']) and re.search('[0-9]', last_span['text']):
                    last_span['tag'] = "page-no"
                    page_no_block_2_remove.append(last_block)

    text_block_to_remove.extend(page_no_block_2_remove)
    for blk in text_block_to_remove:
        # A block may be listed more than once; only remove it while it is still present.
        if blk in text_raw_blocks:
            text_raw_blocks.remove(blk)
    text_block_remain = text_raw_blocks

    # Images and tables are kept only when they touch the content area.
    image_bbox_to_remove = [bbox for bbox in image_bboxes if not _is_in_or_part_overlap(bbox, content_boundry)]
    image_bbox_remain = [bbox for bbox in image_bboxes if _is_in_or_part_overlap(bbox, content_boundry)]
    table_bbox_to_remove = [bbox for bbox in table_bboxes if not _is_in_or_part_overlap(bbox, content_boundry)]
    table_bbox_remain = [bbox for bbox in table_bboxes if _is_in_or_part_overlap(bbox, content_boundry)]
    return image_bbox_remain, table_bbox_remain, text_block_remain, text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove