MinerU/pdf2text_getNumberOfColumn.py

from libs.commons import fitz
from typing import List


def show_image(item, title=""):
    """Display a pixmap.

    Just to display Pixmap image of "item" - ignore the man behind the curtain.

    Args:
        item: any PyMuPDF object having a "get_pixmap" method.
        title: a string to be used as image title

    Generates an RGB Pixmap from item using a constant DPI and using matplotlib
    to show it inline of the notebook.
    """
    DPI = 150  # use this resolution
    import numpy as np
    import matplotlib.pyplot as plt

    # %matplotlib inline
    pix = item.get_pixmap(dpi=DPI)
    img = np.ndarray([pix.h, pix.w, 3], dtype=np.uint8, buffer=pix.samples_mv)
    plt.figure(dpi=DPI)  # set the figure's DPI
    plt.title(title)  # set title of image
    _ = plt.imshow(img, extent=(0, pix.w * 72 / DPI, pix.h * 72 / DPI, 0))


def calculate_overlapRatio_between_line1_and_line2(L1: float, R1: float, L2: float, R2: float) -> (float, float):
    # 计算两个line，重叠line各占2个line长度的比例
    if max(L1, L2) > min(R1, R2):
        return 0, 0
    if L1 == R1 or L2 == R2:
        return 0, 0
    overlap_line = min(R1, R2) - max(L1, L2)
    return overlap_line / (R1 - L1), overlap_line / (R2 - L2)


def get_targetAxis_and_splitAxis(page_ID: int, page: fitz.Page, columnNumber: int, textBboxs: List[(float, float, float, float)]) -> (List[float], List[float]):
    """
    param: page: fitz解析出来的格式
    param: columnNumber: Text的列数
    param: textBboxs: 文本块list。 [(L, U, R, D), ... ]
    return:

    """
    INF = 10 ** 9
    pageL, pageU, pageR, pageD = INF, INF, 0, 0
    for L, U, R, D in textBboxs:
        assert L <= R and U <= D
        pageL = min(pageL, L)
        pageR = max(pageR, R)
        pageU = min(pageU, U)
        pageD = max(pageD, D)

    pageWidth = pageR - pageL
    pageHeight = pageD - pageU
    pageL -= pageWidth / 10  # 10是经验值
    pageR += pageWidth / 10
    pageU -= pageHeight / 10
    pageD += pageHeight / 10
    pageWidth = pageR - pageL
    pageHeight = pageD - pageU

    x_targetAxis = []
    x_splitAxis = []
    for i in range(0, columnNumber * 2 + 1):
        if i & 1:
            x_targetAxis.append(pageL + pageWidth / (2 * columnNumber) * i)
        else:
            x_splitAxis.append(pageL + pageWidth / (2 * columnNumber) * i)

    # # 可视化：分列的外框
    # path_bbox = []
    # N = len(x_targetAxis)
    # for i in range(N):
    #     L, R = x_splitAxis[i], x_splitAxis[i + 1]
    #     path_bbox.append((L, pageU, R, pageD))
    # shape = page.new_shape()
    # # iterate over the bboxes
    # color_map = [fitz.pdfcolor["red"], fitz.pdfcolor["blue"], fitz.pdfcolor["yellow"], fitz.pdfcolor["black"], fitz.pdfcolor["green"], fitz.pdfcolor["brown"]]
    # for i, rect in enumerate(path_bbox):
    #     # if i < 20:
    #     #     continue
    #     shape.draw_rect(rect)  # draw a border
    #     shape.insert_text(Point(rect[0], rect[1])+(5, 15), str(i), color=fitz.pdfcolor["blue"])
    #     shape.finish(color=color_map[i%len(color_map)])
    #     # shape.finish(color=fitz.pdfcolor["blue"])
    #     shape.commit()  # store to the page

    #     # if i == 3:
    #     #     print(rect)
    #     #     break
    #     # print(rect)
    # show_image(page, f"Table & Header BBoxes")

    return x_targetAxis, x_splitAxis


def calculate_loss(page_ID: int, x_targetAxis: List[float], x_splitAxis: List[float], textBboxs: List[(float, float, float, float)]) -> (float, bool):
    INF = 10 ** 9

    # page_artbox = page.artbox
    # pageL, pageU, pageR, pageD = page_artbox[0], page_artbox[1], page_artbox[2], page_artbox[3]

    pageL, pageU, pageR, pageD = INF, INF, 0, 0
    for L, U, R, D in textBboxs:
        assert L <= R and U <= D
        pageL = min(pageL, L)
        pageR = max(pageR, R)
        pageU = min(pageU, U)
        pageD = max(pageD, D)

    pageWidth = pageR - pageL
    pageHeight = pageD - pageU
    pageL -= pageWidth / 10
    pageR += pageWidth / 10
    pageU -= pageHeight / 10
    pageD += pageHeight / 10
    pageWidth = pageR - pageL
    pageHeight = pageD - pageU

    col_N = len(x_targetAxis)  # 列数
    col_texts_mid = [[] for _ in range(col_N)]
    col_texts_LR = [[] for _ in range(col_N)]

    oneLocateLoss_mid = 0
    oneLocateLoss_LR = 0
    oneLocateCnt_mid = 0  # 完美在一列中的个数
    oneLocateCnt_LR = 0
    oneLocateSquare_mid = 0.0  # 完美在一列的面积
    oneLocateSquare_LR = 0.0

    multiLocateLoss_mid = 0
    multiLocateLoss_LR = 0
    multiLocateCnt_mid = 0  # 在多列中的个数
    multiLocateCnt_LR = 0
    multiLocateSquare_mid = 0.0  # 在多列中的面积
    multiLocateSquare_LR = 0.0

    allLocateLoss_mid = 0
    allLocateLoss_LR = 0
    allLocateCnt_mid = 0  # 横跨页面的大框的个数
    allLocateCnt_LR = 0
    allLocateSquare_mid = 0.0  # 横跨整个页面的个数
    allLocateSquare_LR = 0.0

    isSimpleCondition = True  # 就1个。2种方式，只要有一种情况不规整，就是不规整。
    colID_Textcnt_mid = [0 for _ in range(col_N)]  # 每一列中有多少个Text块，根据mid判断的
    colID_Textcnt_LR = [0 for _ in range(col_N)]  # 每一列中有多少个Text块，根据区间边界判断

    allLocateBboxs_mid = []  # 跨整页的，bbox
    allLocateBboxs_LR = []
    non_allLocateBboxs_mid = []
    non_allLocateBboxs_LR = []  # 不在单独某一列，但又不是全列
    for L, U, R, D in textBboxs:
        if D - U < 40:  # 现在还没拼接好。先简单这样过滤页眉。也会牺牲一些很窄的长条
            continue
        if R - L < 40:
            continue
        located_cols_mid = []
        located_cols_LR = []
        for col_ID in range(col_N):
            if col_N == 1:
                located_cols_mid.append(col_ID)
                located_cols_LR.append(col_ID)
            else:
                if L <= x_targetAxis[col_ID] <= R:
                    located_cols_mid.append(col_ID)
                if calculate_overlapRatio_between_line1_and_line2(x_splitAxis[col_ID], x_splitAxis[col_ID + 1], L, R)[0] >= 0.2:
                    located_cols_LR.append(col_ID)

        if len(located_cols_mid) == col_N:
            allLocateBboxs_mid.append((L, U, R, D))
        else:
            non_allLocateBboxs_mid.append((L, U, R, D))
        if len(located_cols_LR) == col_N:
            allLocateBboxs_LR.append((L, U, R, D))
        else:
            non_allLocateBboxs_LR.append((L, U, R, D))

    allLocateBboxs_mid.sort(key=lambda LURD: (LURD[1], LURD[0]))
    non_allLocateBboxs_mid.sort(key=lambda LURD: (LURD[1], LURD[0]))
    allLocateBboxs_LR.sort(key=lambda LURD: (LURD[1], LURD[0]))
    non_allLocateBboxs_LR.sort(key=lambda LURD: (LURD[1], LURD[0]))

    # --------------------判断，是不是有标题类的小块，掺杂在一列的pdf页面里。-------------#
    isOneClumn = False
    under_cnt = 0
    under_square = 0.0
    before_cnt = 0
    before_square = 0.0
    for nL, nU, nR, nD in non_allLocateBboxs_mid:
        cnt = 0
        for L, U, R, D in allLocateBboxs_mid:
            if nD <= U:
                cnt += 1
        if cnt >= 1:
            before_cnt += cnt
            before_square += (R - L) * (D - U) * cnt
        else:
            under_cnt += 1
            under_square += (R - L) * (D - U) * cnt

    if (before_square + under_square) != 0 and before_square / (before_square + under_square) >= 0.2:
        isOneClumn = True

    if isOneClumn == True and col_N != 1:
        return INF, False
    if isOneClumn == True and col_N == 1:
        return 0, True
    #### 根据边界的统计情况，再判断一次
    isOneClumn = False
    under_cnt = 0
    under_square = 0.0
    before_cnt = 0
    before_square = 0.0
    for nL, nU, nR, nD in non_allLocateBboxs_LR:
        cnt = 0
        for L, U, R, D in allLocateBboxs_LR:
            if nD <= U:
                cnt += 1
        if cnt >= 1:
            before_cnt += cnt
            before_square += (R - L) * (D - U) * cnt
        else:
            under_cnt += 1
            under_square += (R - L) * (D - U) * cnt

    if (before_square + under_square) != 0 and before_square / (before_square + under_square) >= 0.2:
        isOneClumn = True

    if isOneClumn == True and col_N != 1:
        return INF, False
    if isOneClumn == True and col_N == 1:
        return 0, True

    for L, U, R, D in textBboxs:
        assert L < R and U < D, 'There is an error on bbox of text when calculate loss!'

        # 简单排除页眉、迷你小块
        # if (D - U) < pageHeight / 15 < 40 or (R - L) < pageWidth / 8:
        if (D - U) < 40:
            continue
        if (R - L) < 40:
            continue
        mid = (L + R) / 2
        located_cols_mid = []  # 在哪一列里，根据中点来判断
        located_cols_LR = []  # 在哪一列里，根据边界判断
        for col_ID in range(col_N):
            if col_N == 1:
                located_cols_mid.append(col_ID)
            else:
                # 根据中点判断
                if L <= x_targetAxis[col_ID] <= R:
                    located_cols_mid.append(col_ID)
                # 根据边界判断
                if calculate_overlapRatio_between_line1_and_line2(x_splitAxis[col_ID], x_splitAxis[col_ID + 1], L, R)[0] >= 0.2:
                    located_cols_LR.append(col_ID)

        ## 1列的情形
        if col_N == 1:
            oneLocateLoss_mid += abs(mid - x_targetAxis[located_cols_mid[0]]) * (D - U) * (R - L)
            # oneLocateLoss_mid += abs(L - x_splitAxis[located_cols[0]]) * (D - U) * (R - L)
            oneLocateLoss_LR += abs(L - x_splitAxis[located_cols_mid[0]]) * (D - U) * (R - L)
            oneLocateCnt_mid += 1
            oneLocateSquare_mid += (D - U) * (R - L)
        ## 多列的情形
        else:
            ######## 根据mid判断
            if len(located_cols_mid) == 1:
                oneLocateLoss_mid += abs(mid - x_targetAxis[located_cols_mid[0]]) * (D - U) * (R - L)
                # oneLocateLoss_mid += abs(L - x_splitAxis[located_cols[0]]) * (D - U) * (R - L)
                oneLocateCnt_mid += 1
                oneLocateSquare_mid += (D - U) * (R - L)
            elif 1 <= len(located_cols_mid) < col_N:
                ll, rr = located_cols_mid[0], located_cols_mid[-1]
                # multiLocateLoss_mid += abs(mid - (x_targetAxis[ll] + x_targetAxis[rr]) / 2) * (D - U) * (R - L)
                multiLocateLoss_mid += abs(mid - x_targetAxis[ll]) * (D - U) * (R - L)
                # multiLocateLoss_mid += abs(mid - (pageL + pageR) / 2) * (D - U) * (R - L)
                multiLocateCnt_mid += 1
                multiLocateSquare_mid += (D - U) * (R - L)
                isSimpleCondition = False
            else:
                allLocateLoss_mid += abs(mid - (pageR + pageL) / 2) * (D - U) * (R - L)
                allLocateCnt_mid += 1
                allLocateSquare_mid += (D - U) * (R - L)
                isSimpleCondition = False

            ######## 根据区间的边界判断
            if len(located_cols_LR) == 1:
                oneLocateLoss_LR += abs(mid - x_targetAxis[located_cols_LR[0]]) * (D - U) * (R - L)
                # oneLocateLoss_LR += abs(L - x_splitAxis[located_cols_LR[0]]) * (D - U) * (R - L)
                oneLocateCnt_LR += 1
                oneLocateSquare_LR += (D - U) * (R - L)
            elif 1 <= len(located_cols_LR) < col_N:
                ll, rr = located_cols_LR[0], located_cols_LR[-1]
                # multiLocateLoss_LR += abs(mid - (x_targetAxis[ll] + x_targetAxis[rr]) / 2) * (D - U) * (R - L)
                multiLocateLoss_LR += abs(mid - x_targetAxis[ll]) * (D - U) * (R - L)
                # multiLocateLoss_LR += abs(mid - (pageL + pageR) / 2) * (D - U) * (R - L)
                multiLocateCnt_LR += 1
                multiLocateSquare_LR += (D - U) * (R - L)
                isSimpleCondition = False
            else:
                allLocateLoss_LR += abs(mid - (pageR + pageL) / 2) * (D - U) * (R - L)
                allLocateCnt_LR += 1
                allLocateSquare_LR += (D - U) * (R - L)
                isSimpleCondition = False

    tot_TextCnt = oneLocateCnt_mid + multiLocateCnt_mid + allLocateCnt_mid
    tot_TextSquare = oneLocateSquare_mid + multiLocateSquare_mid + allLocateSquare_mid

    # 1列的情形
    if tot_TextSquare != 0 and allLocateSquare_mid / tot_TextSquare >= 0.85 and col_N == 1:
        return 0, True

    # 多列的情形

    # if col_N >= 2:
    #     if allLocateCnt >= 1:
    #         oneLocateLoss_mid += ((pageR - pageL)) * oneLocateCnt_mid
    #         multiLocateLoss_mid += ((pageR - pageL) ) * multiLocateCnt_mid
    #     else:
    #         if multiLocateCnt_mid >= 1:
    #             oneLocateLoss_mid += ((pageR - pageL)) * oneLocateCnt_mid
    totLoss_mid = oneLocateLoss_mid + multiLocateLoss_mid + allLocateLoss_mid
    totLoss_LR = oneLocateCnt_LR + multiLocateCnt_LR + allLocateLoss_LR
    return totLoss_mid + totLoss_LR, isSimpleCondition


def get_columnNumber(page_ID: int, page: fitz.Page, textBboxs) -> (int, float):
    columnNumber_loss = dict()
    columnNumber_isSimpleCondition = dict()
    #### 枚举列数
    for columnNumber in range(1, 5):
        # print('---------{}--------'.format(columnNumber))
        x_targetAxis, x_splitAxis = get_targetAxis_and_splitAxis(page_ID, page, columnNumber, textBboxs)
        loss, isSimpleCondition = calculate_loss(page_ID, x_targetAxis, x_splitAxis, textBboxs)
        columnNumber_loss[columnNumber] = loss
        columnNumber_isSimpleCondition[columnNumber] = isSimpleCondition

    col_idxs = [i for i in range(1, len(columnNumber_loss) + 1)]
    col_idxs.sort(key=lambda i: (columnNumber_loss[i], i))

    return col_idxs, columnNumber_loss, columnNumber_isSimpleCondition