Files
MinerU/mineru/utils/pdf_text_tool.py

52 lines
1.5 KiB
Python

import math
from typing import List
import pypdfium2 as pdfium
from pdftext.pdf.chars import deduplicate_chars, get_chars
from pdftext.pdf.pages import assign_scripts, get_blocks, get_lines, get_spans
from mineru.utils.pdfium_guard import pdfium_guard
def get_page(
    page: pdfium.PdfPage,
    quote_loosebox: bool = True,
    superscript_height_threshold: float = 0.7,
    line_distance_threshold: float = 0.1,
) -> dict:
    """Extract the text layout of a single PDF page as a dictionary.

    Characters are read from the page's pdfium text page, de-duplicated, and
    then grouped by the ``pdftext`` helpers into spans, lines and blocks.

    Args:
        page: The pypdfium2 page to read.
        quote_loosebox: Forwarded unchanged to ``pdftext``'s ``get_chars``.
        superscript_height_threshold: Forwarded to ``get_spans`` and, as
            ``height_threshold``, to ``assign_scripts``.
        line_distance_threshold: Forwarded to ``get_spans`` and
            ``assign_scripts``.

    Returns:
        A dict with the keys ``"bbox"`` (the value of ``page.get_bbox()``),
        ``"width"`` and ``"height"`` (bbox extents rounded up to integers),
        ``"rotation"`` (``page.get_rotation()``, or ``0`` if that call
        raised) and ``"blocks"`` (the result of ``get_blocks``).
    """
    # Function-scope import so the module's import block stays untouched.
    import logging

    # NOTE(review): pdfium_guard presumably serializes access to pdfium;
    # confirm against mineru.utils.pdfium_guard. All work stays inside it.
    with pdfium_guard():
        page_bbox: List[float] = page.get_bbox()
        page_width = math.ceil(abs(page_bbox[2] - page_bbox[0]))
        page_height = math.ceil(abs(page_bbox[1] - page_bbox[3]))

        # Reading the rotation is best-effort: fall back to 0, but leave a
        # trace instead of swallowing the failure silently.
        page_rotation = 0
        try:
            page_rotation = page.get_rotation()
        except Exception:
            logging.getLogger(__name__).warning(
                "page.get_rotation() failed; assuming rotation 0",
                exc_info=True,
            )

        # Hold the text page only while characters are extracted, and close
        # it deterministically (still under the guard) rather than leaving
        # the native handle to be released at garbage collection.
        textpage = page.get_textpage()
        try:
            chars = deduplicate_chars(
                get_chars(textpage, page_bbox, page_rotation, quote_loosebox)
            )
        finally:
            textpage.close()

        spans = get_spans(
            chars,
            superscript_height_threshold=superscript_height_threshold,
            line_distance_threshold=line_distance_threshold,
        )
        lines = get_lines(spans)
        # assign_scripts' return value is unused; it is called for its effect
        # on ``lines`` before they are grouped into blocks.
        assign_scripts(
            lines,
            height_threshold=superscript_height_threshold,
            line_distance_threshold=line_distance_threshold,
        )
        blocks = get_blocks(lines)

        return {
            "bbox": page_bbox,
            "width": page_width,
            "height": page_height,
            "rotation": page_rotation,
            "blocks": blocks,
        }