mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 19:18:34 +07:00
52 lines
1.5 KiB
Python
52 lines
1.5 KiB
Python
import math
|
|
from typing import List
|
|
|
|
import pypdfium2 as pdfium
|
|
from pdftext.pdf.chars import deduplicate_chars, get_chars
|
|
from pdftext.pdf.pages import assign_scripts, get_blocks, get_lines, get_spans
|
|
|
|
from mineru.utils.pdfium_guard import pdfium_guard
|
|
|
|
|
|
def get_page(
    page: pdfium.PdfPage,
    quote_loosebox: bool = True,
    superscript_height_threshold: float = 0.7,
    line_distance_threshold: float = 0.1,
) -> dict:
    """Extract the text layout of a single PDF page.

    Characters are read from the page's text page, de-duplicated, and then
    grouped step by step into spans, lines and finally blocks using the
    ``pdftext`` helpers.

    Args:
        page: The pypdfium2 page to read.
        quote_loosebox: Forwarded unchanged to ``get_chars``.
        superscript_height_threshold: Forwarded to ``get_spans`` and, as
            ``height_threshold``, to ``assign_scripts``.
        line_distance_threshold: Forwarded to both ``get_spans`` and
            ``assign_scripts``.

    Returns:
        A dict with the keys ``"bbox"`` (the value of ``page.get_bbox()``),
        ``"width"`` and ``"height"`` (bbox extents rounded up to integers),
        ``"rotation"`` (``0`` when it could not be read) and ``"blocks"``
        (the result of ``get_blocks``).
    """
    # NOTE(review): pdfium_guard presumably serializes access to pdfium;
    # confirm against mineru.utils.pdfium_guard.
    with pdfium_guard():
        text_page = page.get_textpage()
        bbox: List[float] = page.get_bbox()

        # Extents are taken as absolute differences so the order of the
        # bbox corners does not matter, then rounded up to whole units.
        horizontal_extent = abs(bbox[2] - bbox[0])
        vertical_extent = abs(bbox[1] - bbox[3])
        width = math.ceil(horizontal_extent)
        height = math.ceil(vertical_extent)

        # Best effort: any failure to read the rotation falls back to 0.
        try:
            rotation = page.get_rotation()
        except Exception:
            rotation = 0

        raw_chars = get_chars(text_page, bbox, rotation, quote_loosebox)
        unique_chars = deduplicate_chars(raw_chars)

        page_spans = get_spans(
            unique_chars,
            superscript_height_threshold=superscript_height_threshold,
            line_distance_threshold=line_distance_threshold,
        )
        page_lines = get_lines(page_spans)

        # Called for its effect on ``page_lines``; the return value is unused.
        assign_scripts(
            page_lines,
            height_threshold=superscript_height_threshold,
            line_distance_threshold=line_distance_threshold,
        )
        page_blocks = get_blocks(page_lines)

    result = {
        "bbox": bbox,
        "width": width,
        "height": height,
        "rotation": rotation,
        "blocks": page_blocks,
    }
    return result
|