feat: refactor PDF handling to utilize pdfium_guard for resource management

This commit is contained in:
myhloli
2026-03-25 14:11:33 +08:00
parent 0eff2b0d70
commit cf8964c873
10 changed files with 247 additions and 186 deletions

View File

@@ -32,6 +32,11 @@ from mineru.utils.ocr_utils import get_adjusted_mfdetrec_res, get_ocr_result_lis
update_det_boxes, OcrConfidence
from mineru.utils.pdf_classify import classify
from mineru.utils.pdf_image_tools import load_images_from_pdf, load_images_from_pdf_doc
from mineru.utils.pdfium_guard import (
close_pdfium_document,
get_pdfium_document_page_count,
open_pdfium_document,
)
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 让mps可以fallback
os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
@@ -618,15 +623,15 @@ def doc_analyze_low_memory(
_ocr_enable = ocr_classify(pdf_bytes, parse_method=parse_method)
_vlm_ocr_enable = _should_enable_vlm_ocr(_ocr_enable, language, inline_formula_enable)
pdf_doc = pdfium.PdfDocument(pdf_bytes)
pdf_doc = open_pdfium_document(pdfium.PdfDocument, pdf_bytes)
middle_json = init_middle_json(_ocr_enable, _vlm_ocr_enable)
model_list = []
doc_closed = False
hybrid_pipeline_model = None
try:
page_count = len(pdf_doc)
page_count = get_pdfium_document_page_count(pdf_doc)
if page_count == 0:
pdf_doc.close()
close_pdfium_document(pdf_doc)
doc_closed = True
clean_memory(device)
return middle_json, model_list, _vlm_ocr_enable
@@ -702,13 +707,13 @@ def doc_analyze_low_memory(
_ocr_enable,
_vlm_ocr_enable,
)
pdf_doc.close()
close_pdfium_document(pdf_doc)
doc_closed = True
clean_memory(device)
return middle_json, model_list, _vlm_ocr_enable
finally:
if not doc_closed:
pdf_doc.close()
close_pdfium_document(pdf_doc)
async def aio_doc_analyze(
@@ -801,15 +806,15 @@ async def aio_doc_analyze_low_memory(
_ocr_enable = ocr_classify(pdf_bytes, parse_method=parse_method)
_vlm_ocr_enable = _should_enable_vlm_ocr(_ocr_enable, language, inline_formula_enable)
pdf_doc = pdfium.PdfDocument(pdf_bytes)
pdf_doc = open_pdfium_document(pdfium.PdfDocument, pdf_bytes)
middle_json = init_middle_json(_ocr_enable, _vlm_ocr_enable)
model_list = []
doc_closed = False
hybrid_pipeline_model = None
try:
page_count = len(pdf_doc)
page_count = get_pdfium_document_page_count(pdf_doc)
if page_count == 0:
pdf_doc.close()
close_pdfium_document(pdf_doc)
doc_closed = True
clean_memory(device)
return middle_json, model_list, _vlm_ocr_enable
@@ -885,10 +890,10 @@ async def aio_doc_analyze_low_memory(
_ocr_enable,
_vlm_ocr_enable,
)
pdf_doc.close()
close_pdfium_document(pdf_doc)
doc_closed = True
clean_memory(device)
return middle_json, model_list, _vlm_ocr_enable
finally:
if not doc_closed:
pdf_doc.close()
close_pdfium_document(pdf_doc)

View File

@@ -16,6 +16,7 @@ from mineru.utils.enum_class import ContentType
from mineru.utils.hash_utils import bytes_md5
from mineru.utils.ocr_utils import OcrConfidence, rotate_vertical_crop_if_needed
from mineru.utils.pdf_image_tools import get_crop_img
from mineru.utils.pdfium_guard import close_pdfium_document, pdfium_guard
from mineru.version import __version__
@@ -47,7 +48,8 @@ def blocks_to_page_info(
scale = image_dict["scale"]
page_pil_img = image_dict["img_pil"]
page_img_md5 = bytes_md5(page_pil_img.tobytes())
width, height = map(int, page.get_size())
with pdfium_guard():
width, height = map(int, page.get_size())
magic_model = MagicModel(
page_model_list,
@@ -189,7 +191,8 @@ def append_page_results_to_middle_json(
zip(model_list, images_list)
):
page_index = page_start_index + offset
page = pdf_doc[page_index]
with pdfium_guard():
page = pdf_doc[page_index]
page_info = blocks_to_page_info(
page_model_list,
image_dict,
@@ -271,5 +274,5 @@ def result_to_middle_json(
_ocr_enable,
_vlm_ocr_enable,
)
pdf_doc.close()
close_pdfium_document(pdf_doc)
return middle_json

View File

@@ -21,6 +21,7 @@ from mineru.backend.pipeline.pipeline_magic_model import MagicModel
from mineru.utils.ocr_utils import OcrConfidence, rotate_vertical_crop_if_needed
from mineru.version import __version__
from mineru.utils.hash_utils import bytes_md5, str_sha256
from mineru.utils.pdfium_guard import close_pdfium_document, pdfium_guard
def _save_base64_image(b64_data_uri: str, image_writer, page_index: int):
@@ -89,7 +90,8 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
scale = image_dict["scale"]
page_pil_img = image_dict["img_pil"]
page_img_md5 = bytes_md5(page_pil_img.tobytes())
page_w, page_h = map(int, page.get_size())
with pdfium_guard():
page_w, page_h = map(int, page.get_size())
magic_model = MagicModel(
page_model_info,
page,
@@ -141,16 +143,19 @@ def append_page_model_infos_to_middle_json(
):
for offset, (page_model_info, image_dict) in enumerate(zip(page_model_infos, images_list)):
page_index = page_start_index + offset
with pdfium_guard():
page = pdf_doc[page_index]
page_info = page_model_info_to_page_info(
copy.deepcopy(page_model_info),
image_dict,
pdf_doc[page_index],
page,
image_writer,
page_index,
ocr_enable=ocr_enable,
)
if page_info is None:
page_w, page_h = map(int, pdf_doc[page_index].get_size())
with pdfium_guard():
page_w, page_h = map(int, pdf_doc[page_index].get_size())
page_info = make_page_info_dict([], page_index, page_w, page_h, [])
middle_json["pdf_info"].append(page_info)
if progress_bar is not None:
@@ -354,7 +359,7 @@ def result_to_middle_json(model_list, images_list, pdf_doc, image_writer, lang=N
)
finalize_middle_json(middle_json["pdf_info"], lang=lang, ocr_enable=ocr_enable)
pdf_doc.close()
close_pdfium_document(pdf_doc)
return middle_json

View File

@@ -15,6 +15,11 @@ from ...utils.enum_class import ImageType
from ...utils.pdf_classify import classify
from ...utils.pdf_image_tools import load_images_from_pdf, load_images_from_pdf_doc
from ...utils.model_utils import get_vram, clean_memory
from ...utils.pdfium_guard import (
close_pdfium_document,
get_pdfium_document_page_count,
open_pdfium_document,
)
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 让mps可以fallback
@@ -113,7 +118,7 @@ def _close_doc_context(context):
if context['closed']:
return
try:
context['pdf_doc'].close()
close_pdfium_document(context['pdf_doc'])
except Exception:
pass
_close_images(context['images_list'])
@@ -138,7 +143,7 @@ def _finalize_low_memory_context(context, on_doc_ready):
context['middle_json'],
context['ocr_enable'],
)
context['pdf_doc'].close()
close_pdfium_document(context['pdf_doc'])
context['closed'] = True
@@ -180,7 +185,7 @@ def doc_analyze(
for pdf_doc in all_pdf_docs:
if pdf_doc is not None:
try:
pdf_doc.close()
close_pdfium_document(pdf_doc)
except Exception:
pass
for images_list in all_image_lists:
@@ -377,8 +382,8 @@ def doc_analyze_low_memory_multi_streaming(
total_pages = 0
for doc_index, (pdf_bytes, image_writer, lang) in enumerate(zip(pdf_bytes_list, image_writer_list, lang_list)):
_ocr_enable = _get_ocr_enable(pdf_bytes, parse_method)
pdf_doc = pdfium.PdfDocument(pdf_bytes)
page_count = len(pdf_doc)
pdf_doc = open_pdfium_document(pdfium.PdfDocument, pdf_bytes)
page_count = get_pdfium_document_page_count(pdf_doc)
total_pages += page_count
doc_contexts.append(
{
@@ -494,7 +499,7 @@ def doc_analyze_low_memory_multi_streaming(
finally:
for context in doc_contexts:
if not context['closed']:
context['pdf_doc'].close()
close_pdfium_document(context['pdf_doc'])
context['closed'] = True

View File

@@ -13,6 +13,7 @@ from mineru.utils.cut_image import cut_image_and_table
from mineru.utils.enum_class import ContentType
from mineru.utils.hash_utils import bytes_md5
from mineru.utils.pdf_image_tools import get_crop_img
from mineru.utils.pdfium_guard import close_pdfium_document, pdfium_guard
from mineru.version import __version__
@@ -37,7 +38,8 @@ def blocks_to_page_info(page_blocks, image_dict, page, image_writer, page_index)
# page_pil_img = image_dict["img_pil"]
page_pil_img = image_dict["img_pil"]
page_img_md5 = bytes_md5(page_pil_img.tobytes())
width, height = map(int, page.get_size())
with pdfium_guard():
width, height = map(int, page.get_size())
magic_model = MagicModel(page_blocks, width, height)
image_blocks = magic_model.get_image_blocks()
@@ -115,7 +117,8 @@ def append_page_blocks_to_middle_json(
):
for offset, (page_blocks, image_dict) in enumerate(zip(model_output_blocks_list, images_list)):
page_index = page_start_index + offset
page = pdf_doc[page_index]
with pdfium_guard():
page = pdf_doc[page_index]
page_info = blocks_to_page_info(page_blocks, image_dict, page, image_writer, page_index)
middle_json["pdf_info"].append(page_info)
if progress_bar is not None:
@@ -146,5 +149,5 @@ def result_to_middle_json(model_output_blocks_list, images_list, pdf_doc, image_
)
finalize_middle_json(middle_json["pdf_info"])
pdf_doc.close()
close_pdfium_document(pdf_doc)
return middle_json

View File

@@ -19,6 +19,11 @@ from ...utils.check_sys_env import is_mac_os_version_supported
from ...utils.config_reader import get_device, get_low_memory_window_size
from ...utils.enum_class import ImageType
from ...utils.pdfium_guard import (
close_pdfium_document,
get_pdfium_document_page_count,
open_pdfium_document,
)
from ...utils.models_download_utils import auto_download_and_get_model_root_path
from mineru_vl_utils import MinerUClient
@@ -322,14 +327,14 @@ def doc_analyze_low_memory(
predictor = ModelSingleton().get_model(backend, model_path, server_url, **kwargs)
predictor = _maybe_enable_serial_execution(predictor, backend)
pdf_doc = pdfium.PdfDocument(pdf_bytes)
pdf_doc = open_pdfium_document(pdfium.PdfDocument, pdf_bytes)
middle_json = init_middle_json()
results = []
doc_closed = False
try:
page_count = len(pdf_doc)
page_count = get_pdfium_document_page_count(pdf_doc)
if page_count == 0:
pdf_doc.close()
close_pdfium_document(pdf_doc)
doc_closed = True
return middle_json, results
window_size = min(page_count, get_low_memory_window_size(default=64))
@@ -377,12 +382,12 @@ def doc_analyze_low_memory(
f"speed: {round(len(results) / infer_time, 3)} page/s"
)
finalize_middle_json(middle_json["pdf_info"])
pdf_doc.close()
close_pdfium_document(pdf_doc)
doc_closed = True
return middle_json, results
finally:
if not doc_closed:
pdf_doc.close()
close_pdfium_document(pdf_doc)
async def aio_doc_analyze(
@@ -426,14 +431,14 @@ async def aio_doc_analyze_low_memory(
predictor = ModelSingleton().get_model(backend, model_path, server_url, **kwargs)
predictor = _maybe_enable_serial_execution(predictor, backend)
pdf_doc = pdfium.PdfDocument(pdf_bytes)
pdf_doc = open_pdfium_document(pdfium.PdfDocument, pdf_bytes)
middle_json = init_middle_json()
results = []
doc_closed = False
try:
page_count = len(pdf_doc)
page_count = get_pdfium_document_page_count(pdf_doc)
if page_count == 0:
pdf_doc.close()
close_pdfium_document(pdf_doc)
doc_closed = True
return middle_json, results
window_size = min(page_count, get_low_memory_window_size(default=64))
@@ -481,9 +486,9 @@ async def aio_doc_analyze_low_memory(
f"speed: {round(len(results) / infer_time, 3)} page/s"
)
finalize_middle_json(middle_json["pdf_info"])
pdf_doc.close()
close_pdfium_document(pdf_doc)
doc_closed = True
return middle_json, results
finally:
if not doc_closed:
pdf_doc.close()
close_pdfium_document(pdf_doc)

View File

@@ -1,7 +1,6 @@
# Copyright (c) Opendatalab. All rights reserved.
import os
import re
import threading
from io import BytesIO
import numpy as np
@@ -16,6 +15,11 @@ from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from mineru.utils.pdfium_guard import (
close_pdfium_document,
open_pdfium_document,
pdfium_guard,
)
PDF_CLASSIFY_STRATEGY_ENV = "MINERU_PDF_CLASSIFY_STRATEGY"
PDF_CLASSIFY_STRATEGY_HYBRID = "hybrid"
@@ -33,9 +37,6 @@ _ALLOWED_CONTROL_CODES = {9, 10, 13}
_PRIVATE_USE_AREA_START = 0xE000
_PRIVATE_USE_AREA_END = 0xF8FF
_pdf_sample_extract_lock = threading.Lock()
def classify(pdf_bytes):
"""
Classify a PDF as text-based or OCR-based.
@@ -79,48 +80,48 @@ def classify_hybrid(pdf_bytes):
should_run_pdfminer_fallback = False
try:
pdf = pdfium.PdfDocument(pdf_bytes)
page_count = len(pdf)
if page_count == 0:
return "ocr"
page_indices = get_sample_page_indices(page_count, MAX_SAMPLE_PAGES)
if not page_indices:
return "ocr"
if (
get_avg_cleaned_chars_per_page_pdfium(pdf, page_indices)
< CHARS_THRESHOLD
):
return "ocr"
if detect_cid_font_signal_pypdf(pdf_bytes, page_indices):
return "ocr"
text_quality_signal = get_text_quality_signal_pdfium(pdf, page_indices)
total_chars = text_quality_signal["total_chars"]
abnormal_ratio = text_quality_signal["abnormal_ratio"]
if total_chars >= TEXT_QUALITY_MIN_CHARS:
if abnormal_ratio >= TEXT_QUALITY_BAD_THRESHOLD:
with pdfium_guard():
pdf = open_pdfium_document(pdfium.PdfDocument, pdf_bytes)
page_count = len(pdf)
if page_count == 0:
return "ocr"
should_run_pdfminer_fallback = abnormal_ratio > TEXT_QUALITY_GOOD_THRESHOLD
else:
should_run_pdfminer_fallback = True
if (
get_high_image_coverage_ratio_pdfium(pdf, page_indices)
>= HIGH_IMAGE_COVERAGE_THRESHOLD
):
return "ocr"
page_indices = get_sample_page_indices(page_count, MAX_SAMPLE_PAGES)
if not page_indices:
return "ocr"
if (
get_avg_cleaned_chars_per_page_pdfium(pdf, page_indices)
< CHARS_THRESHOLD
):
return "ocr"
if detect_cid_font_signal_pypdf(pdf_bytes, page_indices):
return "ocr"
text_quality_signal = get_text_quality_signal_pdfium(pdf, page_indices)
total_chars = text_quality_signal["total_chars"]
abnormal_ratio = text_quality_signal["abnormal_ratio"]
if total_chars >= TEXT_QUALITY_MIN_CHARS:
if abnormal_ratio >= TEXT_QUALITY_BAD_THRESHOLD:
return "ocr"
should_run_pdfminer_fallback = abnormal_ratio > TEXT_QUALITY_GOOD_THRESHOLD
else:
should_run_pdfminer_fallback = True
if (
get_high_image_coverage_ratio_pdfium(pdf, page_indices)
>= HIGH_IMAGE_COVERAGE_THRESHOLD
):
return "ocr"
except Exception as e:
logger.error(f"Failed to classify PDF with hybrid strategy: {e}")
return "ocr"
finally:
if pdf is not None:
pdf.close()
close_pdfium_document(pdf)
if should_run_pdfminer_fallback:
sample_pdf_bytes = extract_selected_pages(pdf_bytes, page_indices)
@@ -140,33 +141,35 @@ def classify_legacy(pdf_bytes):
sample_pdf_bytes = extract_pages(pdf_bytes)
if not sample_pdf_bytes:
return "ocr"
pdf = pdfium.PdfDocument(sample_pdf_bytes)
pdf = None
try:
page_count = len(pdf)
if page_count == 0:
return "ocr"
with pdfium_guard():
pdf = open_pdfium_document(pdfium.PdfDocument, sample_pdf_bytes)
page_count = len(pdf)
if page_count == 0:
return "ocr"
pages_to_check = min(page_count, MAX_SAMPLE_PAGES)
pages_to_check = min(page_count, MAX_SAMPLE_PAGES)
if (
get_avg_cleaned_chars_per_page(pdf, pages_to_check) < CHARS_THRESHOLD
) or detect_invalid_chars(sample_pdf_bytes):
return "ocr"
if (
get_avg_cleaned_chars_per_page(pdf, pages_to_check) < CHARS_THRESHOLD
) or detect_invalid_chars(sample_pdf_bytes):
return "ocr"
if (
get_high_image_coverage_ratio(sample_pdf_bytes, pages_to_check)
>= HIGH_IMAGE_COVERAGE_THRESHOLD
):
return "ocr"
if (
get_high_image_coverage_ratio(sample_pdf_bytes, pages_to_check)
>= HIGH_IMAGE_COVERAGE_THRESHOLD
):
return "ocr"
return "txt"
return "txt"
except Exception as e:
logger.error(f"Failed to classify PDF with legacy strategy: {e}")
logger.warning(f"Failed to classify PDF with legacy strategy: {e}")
return "ocr"
finally:
pdf.close()
close_pdfium_document(pdf)
def get_sample_page_indices(page_count: int, max_pages: int = MAX_SAMPLE_PAGES):
@@ -402,11 +405,11 @@ def extract_pages(src_pdf_bytes: bytes) -> bytes:
Extract up to 10 random pages and return them as a new PDF.
"""
with _pdf_sample_extract_lock:
pdf = None
sample_docs = None
try:
pdf = pdfium.PdfDocument(src_pdf_bytes)
pdf = None
sample_docs = None
try:
with pdfium_guard():
pdf = open_pdfium_document(pdfium.PdfDocument, src_pdf_bytes)
total_page = len(pdf)
if total_page == 0:
logger.warning("PDF is empty, return empty document")
@@ -420,20 +423,18 @@ def extract_pages(src_pdf_bytes: bytes) -> bytes:
total_page, select_page_cnt, replace=False
).tolist()
sample_docs = pdfium.PdfDocument.new()
sample_docs = open_pdfium_document(pdfium.PdfDocument.new)
sample_docs.import_pages(pdf, page_indices)
output_buffer = BytesIO()
sample_docs.save(output_buffer)
return output_buffer.getvalue()
except Exception as e:
logger.exception(e)
return src_pdf_bytes
finally:
if pdf is not None:
pdf.close()
if sample_docs is not None:
sample_docs.close()
except Exception as e:
logger.exception(e)
return src_pdf_bytes
finally:
close_pdfium_document(pdf)
close_pdfium_document(sample_docs)
def extract_selected_pages(src_pdf_bytes: bytes, page_indices) -> bytes:
@@ -445,11 +446,11 @@ def extract_selected_pages(src_pdf_bytes: bytes, page_indices) -> bytes:
if not selected_page_indices:
return b""
with _pdf_sample_extract_lock:
pdf = None
sample_docs = None
try:
pdf = pdfium.PdfDocument(src_pdf_bytes)
pdf = None
sample_docs = None
try:
with pdfium_guard():
pdf = open_pdfium_document(pdfium.PdfDocument, src_pdf_bytes)
total_page = len(pdf)
if total_page == 0:
logger.warning("PDF is empty, return empty document")
@@ -466,20 +467,18 @@ def extract_selected_pages(src_pdf_bytes: bytes, page_indices) -> bytes:
if selected_page_indices == list(range(total_page)):
return src_pdf_bytes
sample_docs = pdfium.PdfDocument.new()
sample_docs = open_pdfium_document(pdfium.PdfDocument.new)
sample_docs.import_pages(pdf, selected_page_indices)
output_buffer = BytesIO()
sample_docs.save(output_buffer)
return output_buffer.getvalue()
except Exception as e:
logger.exception(e)
return src_pdf_bytes
finally:
if pdf is not None:
pdf.close()
if sample_docs is not None:
sample_docs.close()
except Exception as e:
logger.exception(e)
return src_pdf_bytes
finally:
close_pdfium_document(pdf)
close_pdfium_document(sample_docs)
def detect_invalid_chars(sample_pdf_bytes: bytes) -> bool:

View File

@@ -17,6 +17,12 @@ from mineru.utils.pdf_reader import image_to_b64str, image_to_bytes, page_to_ima
from mineru.utils.enum_class import ImageType
from mineru.utils.hash_utils import str_sha256
from mineru.utils.pdf_page_id import get_end_page_id
from mineru.utils.pdfium_guard import (
close_pdfium_document,
get_pdfium_document_page_count,
open_pdfium_document,
pdfium_guard,
)
from concurrent.futures import ProcessPoolExecutor, wait, ALL_COMPLETED
@@ -130,23 +136,34 @@ def load_images_from_pdf(
Raises:
TimeoutError: 当转换超时时抛出
"""
pdf_doc = pdfium.PdfDocument(pdf_bytes)
pdf_doc = open_pdfium_document(pdfium.PdfDocument, pdf_bytes)
if is_windows_environment():
# Windows 环境下不使用多进程
return load_images_from_pdf_core(
pdf_bytes,
dpi,
start_page_id,
get_end_page_id(end_page_id, len(pdf_doc)),
image_type,
), pdf_doc
try:
images_list = load_images_from_pdf_core(
pdf_bytes,
dpi,
start_page_id,
get_end_page_id(
end_page_id,
get_pdfium_document_page_count(pdf_doc),
),
image_type,
)
return images_list, pdf_doc
except Exception:
close_pdfium_document(pdf_doc)
raise
else:
if timeout is None:
timeout = get_load_images_timeout()
if threads is None:
threads = get_load_images_threads()
end_page_id = get_end_page_id(end_page_id, len(pdf_doc))
end_page_id = get_end_page_id(
end_page_id,
get_pdfium_document_page_count(pdf_doc),
)
actual_threads, page_ranges = _get_render_process_plan(
start_page_id,
end_page_id,
@@ -179,7 +196,7 @@ def load_images_from_pdf(
if not_done:
# 超时:强制终止所有子进程
_terminate_executor_processes(executor)
pdf_doc.close()
close_pdfium_document(pdf_doc)
raise TimeoutError(f"PDF to images conversion timeout after {timeout}s")
# 所有任务完成,收集结果
@@ -201,7 +218,7 @@ def load_images_from_pdf(
except Exception as e:
# 发生任何异常时,确保清理子进程
_terminate_executor_processes(executor)
pdf_doc.close()
close_pdfium_document(pdf_doc)
if isinstance(e, TimeoutError):
raise
raise
@@ -240,17 +257,20 @@ def load_images_from_pdf_core(
image_type=ImageType.PIL, # PIL or BASE64
):
images_list = []
pdf_doc = pdfium.PdfDocument(pdf_bytes)
pdf_page_num = len(pdf_doc)
end_page_id = get_end_page_id(end_page_id, pdf_page_num)
pdf_doc = None
try:
with pdfium_guard():
pdf_doc = open_pdfium_document(pdfium.PdfDocument, pdf_bytes)
pdf_page_num = len(pdf_doc)
end_page_id = get_end_page_id(end_page_id, pdf_page_num)
for index in range(start_page_id, end_page_id + 1):
# logger.debug(f"Converting page {index}/{pdf_page_num} to image")
page = pdf_doc[index]
image_dict = pdf_page_to_image(page, dpi=dpi, image_type=image_type)
images_list.append(image_dict)
pdf_doc.close()
for index in range(start_page_id, end_page_id + 1):
# logger.debug(f"Converting page {index}/{pdf_page_num} to image")
page = pdf_doc[index]
image_dict = pdf_page_to_image(page, dpi=dpi, image_type=image_type)
images_list.append(image_dict)
finally:
close_pdfium_document(pdf_doc)
return images_list
@@ -263,13 +283,14 @@ def load_images_from_pdf_doc(
image_type=ImageType.PIL,
):
images_list = []
pdf_page_num = len(pdf_doc)
end_page_id = get_end_page_id(end_page_id, pdf_page_num)
with pdfium_guard():
pdf_page_num = len(pdf_doc)
end_page_id = get_end_page_id(end_page_id, pdf_page_num)
for index in range(start_page_id, end_page_id + 1):
page = pdf_doc[index]
image_dict = pdf_page_to_image(page, dpi=dpi, image_type=image_type)
images_list.append(image_dict)
for index in range(start_page_id, end_page_id + 1):
page = pdf_doc[index]
image_dict = pdf_page_to_image(page, dpi=dpi, image_type=image_type)
images_list.append(image_dict)
return images_list
@@ -354,4 +375,4 @@ def images_bytes_to_pdf_bytes(image_bytes):
# 获取 PDF bytes 并重置指针(可选)
pdf_bytes = pdf_buffer.getvalue()
pdf_buffer.close()
return pdf_bytes
return pdf_bytes

View File

@@ -5,6 +5,11 @@ from io import BytesIO
from loguru import logger
from PIL import Image
from pypdfium2 import PdfBitmap, PdfDocument, PdfPage
from mineru.utils.pdfium_guard import (
close_pdfium_document,
open_pdfium_document,
pdfium_guard,
)
def page_to_image(
@@ -12,19 +17,20 @@ def page_to_image(
dpi: int = 200,
max_width_or_height: int = 3500, # changed from 4500 to 3500
) -> (Image.Image, float):
scale = dpi / 72
with pdfium_guard():
scale = dpi / 72
long_side_length = max(*page.get_size())
if (long_side_length*scale) > max_width_or_height:
scale = max_width_or_height / long_side_length
long_side_length = max(*page.get_size())
if (long_side_length*scale) > max_width_or_height:
scale = max_width_or_height / long_side_length
bitmap: PdfBitmap = page.render(scale=scale) # type: ignore
bitmap: PdfBitmap = page.render(scale=scale) # type: ignore
image = bitmap.to_pil()
try:
bitmap.close()
except Exception as e:
logger.error(f"Failed to close bitmap: {e}")
image = bitmap.to_pil()
try:
bitmap.close()
except Exception as e:
logger.error(f"Failed to close bitmap: {e}")
return image, scale
@@ -65,25 +71,23 @@ def pdf_to_images(
start_page_id: int = 0,
end_page_id: int | None = None,
) -> list[Image.Image]:
doc = pdf if isinstance(pdf, PdfDocument) else PdfDocument(pdf)
page_num = len(doc)
end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else page_num - 1
if end_page_id > page_num - 1:
logger.warning("end_page_id is out of range, use images length")
end_page_id = page_num - 1
images = []
doc = pdf if isinstance(pdf, PdfDocument) else open_pdfium_document(PdfDocument, pdf)
try:
for i in range(start_page_id, end_page_id + 1):
image, _ = page_to_image(doc[i], dpi, max_width_or_height)
images.append(image)
with pdfium_guard():
page_num = len(doc)
end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else page_num - 1
if end_page_id > page_num - 1:
logger.warning("end_page_id is out of range, use images length")
end_page_id = page_num - 1
images = []
for i in range(start_page_id, end_page_id + 1):
image, _ = page_to_image(doc[i], dpi, max_width_or_height)
images.append(image)
return images
finally:
try:
doc.close()
except Exception:
pass
return images
close_pdfium_document(doc)
def pdf_to_images_bytes(

View File

@@ -1,18 +1,20 @@
from typing import List
import math
from typing import List
import pypdfium2 as pdfium
from pdftext.pdf.chars import get_chars, deduplicate_chars
from pdftext.pdf.pages import get_spans, get_lines, assign_scripts, get_blocks
from pdftext.pdf.chars import deduplicate_chars, get_chars
from pdftext.pdf.pages import assign_scripts, get_blocks, get_lines, get_spans
from mineru.utils.pdfium_guard import pdfium_guard
def get_page(
page: pdfium.PdfPage,
quote_loosebox: bool =True,
quote_loosebox: bool = True,
superscript_height_threshold: float = 0.7,
line_distance_threshold: float = 0.1,
) -> dict:
with pdfium_guard():
textpage = page.get_textpage()
page_bbox: List[float] = page.get_bbox()
page_width = math.ceil(abs(page_bbox[2] - page_bbox[0]))
@@ -21,20 +23,29 @@ def get_page(
page_rotation = 0
try:
page_rotation = page.get_rotation()
except:
except Exception:
pass
chars = deduplicate_chars(get_chars(textpage, page_bbox, page_rotation, quote_loosebox))
spans = get_spans(chars, superscript_height_threshold=superscript_height_threshold, line_distance_threshold=line_distance_threshold)
chars = deduplicate_chars(
get_chars(textpage, page_bbox, page_rotation, quote_loosebox)
)
spans = get_spans(
chars,
superscript_height_threshold=superscript_height_threshold,
line_distance_threshold=line_distance_threshold,
)
lines = get_lines(spans)
assign_scripts(lines, height_threshold=superscript_height_threshold, line_distance_threshold=line_distance_threshold)
assign_scripts(
lines,
height_threshold=superscript_height_threshold,
line_distance_threshold=line_distance_threshold,
)
blocks = get_blocks(lines)
page = {
return {
"bbox": page_bbox,
"width": page_width,
"height": page_height,
"rotation": page_rotation,
"blocks": blocks
"blocks": blocks,
}
return page