diff --git a/mineru/backend/hybrid/hybrid_analyze.py b/mineru/backend/hybrid/hybrid_analyze.py index e76dca59..aae4bfb8 100644 --- a/mineru/backend/hybrid/hybrid_analyze.py +++ b/mineru/backend/hybrid/hybrid_analyze.py @@ -32,6 +32,11 @@ from mineru.utils.ocr_utils import get_adjusted_mfdetrec_res, get_ocr_result_lis update_det_boxes, OcrConfidence from mineru.utils.pdf_classify import classify from mineru.utils.pdf_image_tools import load_images_from_pdf, load_images_from_pdf_doc +from mineru.utils.pdfium_guard import ( + close_pdfium_document, + get_pdfium_document_page_count, + open_pdfium_document, +) os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 让mps可以fallback os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新 @@ -618,15 +623,15 @@ def doc_analyze_low_memory( _ocr_enable = ocr_classify(pdf_bytes, parse_method=parse_method) _vlm_ocr_enable = _should_enable_vlm_ocr(_ocr_enable, language, inline_formula_enable) - pdf_doc = pdfium.PdfDocument(pdf_bytes) + pdf_doc = open_pdfium_document(pdfium.PdfDocument, pdf_bytes) middle_json = init_middle_json(_ocr_enable, _vlm_ocr_enable) model_list = [] doc_closed = False hybrid_pipeline_model = None try: - page_count = len(pdf_doc) + page_count = get_pdfium_document_page_count(pdf_doc) if page_count == 0: - pdf_doc.close() + close_pdfium_document(pdf_doc) doc_closed = True clean_memory(device) return middle_json, model_list, _vlm_ocr_enable @@ -702,13 +707,13 @@ def doc_analyze_low_memory( _ocr_enable, _vlm_ocr_enable, ) - pdf_doc.close() + close_pdfium_document(pdf_doc) doc_closed = True clean_memory(device) return middle_json, model_list, _vlm_ocr_enable finally: if not doc_closed: - pdf_doc.close() + close_pdfium_document(pdf_doc) async def aio_doc_analyze( @@ -801,15 +806,15 @@ async def aio_doc_analyze_low_memory( _ocr_enable = ocr_classify(pdf_bytes, parse_method=parse_method) _vlm_ocr_enable = _should_enable_vlm_ocr(_ocr_enable, language, inline_formula_enable) - pdf_doc = pdfium.PdfDocument(pdf_bytes) + pdf_doc = open_pdfium_document(pdfium.PdfDocument, pdf_bytes) middle_json = init_middle_json(_ocr_enable, _vlm_ocr_enable) model_list = [] doc_closed = False hybrid_pipeline_model = None try: - page_count = len(pdf_doc) + page_count = get_pdfium_document_page_count(pdf_doc) if page_count == 0: - pdf_doc.close() + close_pdfium_document(pdf_doc) doc_closed = True clean_memory(device) return middle_json, model_list, _vlm_ocr_enable @@ -885,10 +890,10 @@ async def aio_doc_analyze_low_memory( _ocr_enable, _vlm_ocr_enable, ) - pdf_doc.close() + close_pdfium_document(pdf_doc) doc_closed = True clean_memory(device) return middle_json, model_list, _vlm_ocr_enable finally: if not doc_closed: - pdf_doc.close() + close_pdfium_document(pdf_doc) diff --git a/mineru/backend/hybrid/hybrid_model_output_to_middle_json.py b/mineru/backend/hybrid/hybrid_model_output_to_middle_json.py index f5c04d9a..0c40e05a 100644 --- a/mineru/backend/hybrid/hybrid_model_output_to_middle_json.py +++ b/mineru/backend/hybrid/hybrid_model_output_to_middle_json.py @@ -16,6 +16,7 @@ from mineru.utils.enum_class import ContentType from mineru.utils.hash_utils import bytes_md5 from mineru.utils.ocr_utils import OcrConfidence, rotate_vertical_crop_if_needed from mineru.utils.pdf_image_tools import get_crop_img +from mineru.utils.pdfium_guard import close_pdfium_document, pdfium_guard from mineru.version import __version__ @@ -47,7 +48,8 @@ def blocks_to_page_info( scale = image_dict["scale"] page_pil_img = image_dict["img_pil"] page_img_md5 = bytes_md5(page_pil_img.tobytes()) - width, height = map(int, page.get_size()) + with pdfium_guard(): + width, height = map(int, page.get_size()) magic_model = MagicModel( page_model_list, @@ -189,7 +191,8 @@ def append_page_results_to_middle_json( zip(model_list, images_list) ): page_index = page_start_index + offset - page = pdf_doc[page_index] + with pdfium_guard(): + page = pdf_doc[page_index] page_info = blocks_to_page_info( page_model_list, image_dict, @@ -271,5 +274,5 @@ def result_to_middle_json( _ocr_enable, _vlm_ocr_enable, ) - pdf_doc.close() + close_pdfium_document(pdf_doc) return middle_json diff --git a/mineru/backend/pipeline/model_json_to_middle_json.py b/mineru/backend/pipeline/model_json_to_middle_json.py index 5f66e6f5..0a0baa0d 100644 --- a/mineru/backend/pipeline/model_json_to_middle_json.py +++ b/mineru/backend/pipeline/model_json_to_middle_json.py @@ -21,6 +21,7 @@ from mineru.backend.pipeline.pipeline_magic_model import MagicModel from mineru.utils.ocr_utils import OcrConfidence, rotate_vertical_crop_if_needed from mineru.version import __version__ from mineru.utils.hash_utils import bytes_md5, str_sha256 +from mineru.utils.pdfium_guard import close_pdfium_document, pdfium_guard def _save_base64_image(b64_data_uri: str, image_writer, page_index: int): @@ -89,7 +90,8 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer scale = image_dict["scale"] page_pil_img = image_dict["img_pil"] page_img_md5 = bytes_md5(page_pil_img.tobytes()) - page_w, page_h = map(int, page.get_size()) + with pdfium_guard(): + page_w, page_h = map(int, page.get_size()) magic_model = MagicModel( page_model_info, page, @@ -141,16 +143,19 @@ def append_page_model_infos_to_middle_json( ): for offset, (page_model_info, image_dict) in enumerate(zip(page_model_infos, images_list)): page_index = page_start_index + offset + with pdfium_guard(): + page = pdf_doc[page_index] page_info = page_model_info_to_page_info( copy.deepcopy(page_model_info), image_dict, - pdf_doc[page_index], + page, image_writer, page_index, ocr_enable=ocr_enable, ) if page_info is None: - page_w, page_h = map(int, pdf_doc[page_index].get_size()) + with pdfium_guard(): + page_w, page_h = map(int, pdf_doc[page_index].get_size()) page_info = make_page_info_dict([], page_index, page_w, page_h, []) middle_json["pdf_info"].append(page_info) if progress_bar is not None: @@ -354,7 +359,7 @@ def result_to_middle_json(model_list, images_list, pdf_doc, image_writer, lang=N ) finalize_middle_json(middle_json["pdf_info"], lang=lang, ocr_enable=ocr_enable) - pdf_doc.close() + close_pdfium_document(pdf_doc) return middle_json diff --git a/mineru/backend/pipeline/pipeline_analyze.py b/mineru/backend/pipeline/pipeline_analyze.py index a625528f..4a988e09 100644 --- a/mineru/backend/pipeline/pipeline_analyze.py +++ b/mineru/backend/pipeline/pipeline_analyze.py @@ -15,6 +15,11 @@ from ...utils.enum_class import ImageType from ...utils.pdf_classify import classify from ...utils.pdf_image_tools import load_images_from_pdf, load_images_from_pdf_doc from ...utils.model_utils import get_vram, clean_memory +from ...utils.pdfium_guard import ( + close_pdfium_document, + get_pdfium_document_page_count, + open_pdfium_document, +) os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 让mps可以fallback @@ -113,7 +118,7 @@ def _close_doc_context(context): if context['closed']: return try: - context['pdf_doc'].close() + close_pdfium_document(context['pdf_doc']) except Exception: pass _close_images(context['images_list']) @@ -138,7 +143,7 @@ def _finalize_low_memory_context(context, on_doc_ready): context['middle_json'], context['ocr_enable'], ) - context['pdf_doc'].close() + close_pdfium_document(context['pdf_doc']) context['closed'] = True @@ -180,7 +185,7 @@ def doc_analyze( for pdf_doc in all_pdf_docs: if pdf_doc is not None: try: - pdf_doc.close() + close_pdfium_document(pdf_doc) except Exception: pass for images_list in all_image_lists: @@ -377,8 +382,8 @@ def doc_analyze_low_memory_multi_streaming( total_pages = 0 for doc_index, (pdf_bytes, image_writer, lang) in enumerate(zip(pdf_bytes_list, image_writer_list, lang_list)): _ocr_enable = _get_ocr_enable(pdf_bytes, parse_method) - pdf_doc = pdfium.PdfDocument(pdf_bytes) - page_count = len(pdf_doc) + pdf_doc = open_pdfium_document(pdfium.PdfDocument, pdf_bytes) + page_count = get_pdfium_document_page_count(pdf_doc) total_pages += page_count doc_contexts.append( { @@ -494,7 +499,7 @@ def doc_analyze_low_memory_multi_streaming( finally: for context in doc_contexts: if not context['closed']: - context['pdf_doc'].close() + close_pdfium_document(context['pdf_doc']) context['closed'] = True diff --git a/mineru/backend/vlm/model_output_to_middle_json.py b/mineru/backend/vlm/model_output_to_middle_json.py index b2dec917..01cdfe2e 100644 --- a/mineru/backend/vlm/model_output_to_middle_json.py +++ b/mineru/backend/vlm/model_output_to_middle_json.py @@ -13,6 +13,7 @@ from mineru.utils.cut_image import cut_image_and_table from mineru.utils.enum_class import ContentType from mineru.utils.hash_utils import bytes_md5 from mineru.utils.pdf_image_tools import get_crop_img +from mineru.utils.pdfium_guard import close_pdfium_document, pdfium_guard from mineru.version import __version__ @@ -37,7 +38,8 @@ def blocks_to_page_info(page_blocks, image_dict, page, image_writer, page_index) # page_pil_img = image_dict["img_pil"] page_pil_img = image_dict["img_pil"] page_img_md5 = bytes_md5(page_pil_img.tobytes()) - width, height = map(int, page.get_size()) + with pdfium_guard(): + width, height = map(int, page.get_size()) magic_model = MagicModel(page_blocks, width, height) image_blocks = magic_model.get_image_blocks() @@ -115,7 +117,8 @@ def append_page_blocks_to_middle_json( ): for offset, (page_blocks, image_dict) in enumerate(zip(model_output_blocks_list, images_list)): page_index = page_start_index + offset - page = pdf_doc[page_index] + with pdfium_guard(): + page = pdf_doc[page_index] page_info = blocks_to_page_info(page_blocks, image_dict, page, image_writer, page_index) middle_json["pdf_info"].append(page_info) if progress_bar is not None: @@ -146,5 +149,5 @@ def result_to_middle_json(model_output_blocks_list, images_list, pdf_doc, image_ ) finalize_middle_json(middle_json["pdf_info"]) - pdf_doc.close() + close_pdfium_document(pdf_doc) return middle_json diff --git a/mineru/backend/vlm/vlm_analyze.py b/mineru/backend/vlm/vlm_analyze.py index 114a6848..33790a3f 100644 --- a/mineru/backend/vlm/vlm_analyze.py +++ b/mineru/backend/vlm/vlm_analyze.py @@ -19,6 +19,11 @@ from ...utils.check_sys_env import is_mac_os_version_supported from ...utils.config_reader import get_device, get_low_memory_window_size from ...utils.enum_class import ImageType +from ...utils.pdfium_guard import ( + close_pdfium_document, + get_pdfium_document_page_count, + open_pdfium_document, +) from ...utils.models_download_utils import auto_download_and_get_model_root_path from mineru_vl_utils import MinerUClient @@ -322,14 +327,14 @@ def doc_analyze_low_memory( predictor = ModelSingleton().get_model(backend, model_path, server_url, **kwargs) predictor = _maybe_enable_serial_execution(predictor, backend) - pdf_doc = pdfium.PdfDocument(pdf_bytes) + pdf_doc = open_pdfium_document(pdfium.PdfDocument, pdf_bytes) middle_json = init_middle_json() results = [] doc_closed = False try: - page_count = len(pdf_doc) + page_count = get_pdfium_document_page_count(pdf_doc) if page_count == 0: - pdf_doc.close() + close_pdfium_document(pdf_doc) doc_closed = True return middle_json, results window_size = min(page_count, get_low_memory_window_size(default=64)) @@ -377,12 +382,12 @@ def doc_analyze_low_memory( f"speed: {round(len(results) / infer_time, 3)} page/s" ) finalize_middle_json(middle_json["pdf_info"]) - pdf_doc.close() + close_pdfium_document(pdf_doc) doc_closed = True return middle_json, results finally: if not doc_closed: - pdf_doc.close() + close_pdfium_document(pdf_doc) async def aio_doc_analyze( @@ -426,14 +431,14 @@ async def aio_doc_analyze_low_memory( predictor = ModelSingleton().get_model(backend, model_path, server_url, **kwargs) predictor = _maybe_enable_serial_execution(predictor, backend) - pdf_doc = pdfium.PdfDocument(pdf_bytes) + pdf_doc = open_pdfium_document(pdfium.PdfDocument, pdf_bytes) middle_json = init_middle_json() results = [] doc_closed = False try: - page_count = len(pdf_doc) + page_count = get_pdfium_document_page_count(pdf_doc) if page_count == 0: - pdf_doc.close() + close_pdfium_document(pdf_doc) doc_closed = True return middle_json, results window_size = min(page_count, get_low_memory_window_size(default=64)) @@ -481,9 +486,9 @@ async def aio_doc_analyze_low_memory( f"speed: {round(len(results) / infer_time, 3)} page/s" ) finalize_middle_json(middle_json["pdf_info"]) - pdf_doc.close() + close_pdfium_document(pdf_doc) doc_closed = True return middle_json, results finally: if not doc_closed: - pdf_doc.close() + close_pdfium_document(pdf_doc) diff --git a/mineru/utils/pdf_classify.py b/mineru/utils/pdf_classify.py index 1a28197d..1e84e088 100644 --- a/mineru/utils/pdf_classify.py +++ b/mineru/utils/pdf_classify.py @@ -1,7 +1,6 @@ # Copyright (c) Opendatalab. All rights reserved. import os import re -import threading from io import BytesIO import numpy as np @@ -16,6 +15,11 @@ from pdfminer.pdfdocument import PDFDocument from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager from pdfminer.pdfpage import PDFPage from pdfminer.pdfparser import PDFParser +from mineru.utils.pdfium_guard import ( + close_pdfium_document, + open_pdfium_document, + pdfium_guard, +) PDF_CLASSIFY_STRATEGY_ENV = "MINERU_PDF_CLASSIFY_STRATEGY" PDF_CLASSIFY_STRATEGY_HYBRID = "hybrid" @@ -33,9 +37,6 @@ _ALLOWED_CONTROL_CODES = {9, 10, 13} _PRIVATE_USE_AREA_START = 0xE000 _PRIVATE_USE_AREA_END = 0xF8FF -_pdf_sample_extract_lock = threading.Lock() - - def classify(pdf_bytes): """ Classify a PDF as text-based or OCR-based. @@ -79,48 +80,48 @@ def classify_hybrid(pdf_bytes): should_run_pdfminer_fallback = False try: - pdf = pdfium.PdfDocument(pdf_bytes) - page_count = len(pdf) - if page_count == 0: - return "ocr" - - page_indices = get_sample_page_indices(page_count, MAX_SAMPLE_PAGES) - if not page_indices: - return "ocr" - - if ( - get_avg_cleaned_chars_per_page_pdfium(pdf, page_indices) - < CHARS_THRESHOLD - ): - return "ocr" - - if detect_cid_font_signal_pypdf(pdf_bytes, page_indices): - return "ocr" - - text_quality_signal = get_text_quality_signal_pdfium(pdf, page_indices) - total_chars = text_quality_signal["total_chars"] - abnormal_ratio = text_quality_signal["abnormal_ratio"] - - if total_chars >= TEXT_QUALITY_MIN_CHARS: - if abnormal_ratio >= TEXT_QUALITY_BAD_THRESHOLD: + with pdfium_guard(): + pdf = open_pdfium_document(pdfium.PdfDocument, pdf_bytes) + page_count = len(pdf) + if page_count == 0: return "ocr" - should_run_pdfminer_fallback = abnormal_ratio > TEXT_QUALITY_GOOD_THRESHOLD - else: - should_run_pdfminer_fallback = True - if ( - get_high_image_coverage_ratio_pdfium(pdf, page_indices) - >= HIGH_IMAGE_COVERAGE_THRESHOLD - ): - return "ocr" + page_indices = get_sample_page_indices(page_count, MAX_SAMPLE_PAGES) + if not page_indices: + return "ocr" + + if ( + get_avg_cleaned_chars_per_page_pdfium(pdf, page_indices) + < CHARS_THRESHOLD + ): + return "ocr" + + if detect_cid_font_signal_pypdf(pdf_bytes, page_indices): + return "ocr" + + text_quality_signal = get_text_quality_signal_pdfium(pdf, page_indices) + total_chars = text_quality_signal["total_chars"] + abnormal_ratio = text_quality_signal["abnormal_ratio"] + + if total_chars >= TEXT_QUALITY_MIN_CHARS: + if abnormal_ratio >= TEXT_QUALITY_BAD_THRESHOLD: + return "ocr" + should_run_pdfminer_fallback = abnormal_ratio > TEXT_QUALITY_GOOD_THRESHOLD + else: + should_run_pdfminer_fallback = True + + if ( + get_high_image_coverage_ratio_pdfium(pdf, page_indices) + >= HIGH_IMAGE_COVERAGE_THRESHOLD + ): + return "ocr" except Exception as e: logger.error(f"Failed to classify PDF with hybrid strategy: {e}") return "ocr" finally: - if pdf is not None: - pdf.close() + close_pdfium_document(pdf) if should_run_pdfminer_fallback: sample_pdf_bytes = extract_selected_pages(pdf_bytes, page_indices) @@ -140,33 +141,35 @@ def classify_legacy(pdf_bytes): sample_pdf_bytes = extract_pages(pdf_bytes) if not sample_pdf_bytes: return "ocr" - pdf = pdfium.PdfDocument(sample_pdf_bytes) + pdf = None try: - page_count = len(pdf) - if page_count == 0: - return "ocr" + with pdfium_guard(): + pdf = open_pdfium_document(pdfium.PdfDocument, sample_pdf_bytes) + page_count = len(pdf) + if page_count == 0: + return "ocr" - pages_to_check = min(page_count, MAX_SAMPLE_PAGES) + pages_to_check = min(page_count, MAX_SAMPLE_PAGES) - if ( - get_avg_cleaned_chars_per_page(pdf, pages_to_check) < CHARS_THRESHOLD - ) or detect_invalid_chars(sample_pdf_bytes): - return "ocr" + if ( + get_avg_cleaned_chars_per_page(pdf, pages_to_check) < CHARS_THRESHOLD + ) or detect_invalid_chars(sample_pdf_bytes): + return "ocr" - if ( - get_high_image_coverage_ratio(sample_pdf_bytes, pages_to_check) - >= HIGH_IMAGE_COVERAGE_THRESHOLD - ): - return "ocr" + if ( + get_high_image_coverage_ratio(sample_pdf_bytes, pages_to_check) + >= HIGH_IMAGE_COVERAGE_THRESHOLD + ): + return "ocr" - return "txt" + return "txt" except Exception as e: - logger.error(f"Failed to classify PDF with legacy strategy: {e}") + logger.warning(f"Failed to classify PDF with legacy strategy: {e}") return "ocr" finally: - pdf.close() + close_pdfium_document(pdf) def get_sample_page_indices(page_count: int, max_pages: int = MAX_SAMPLE_PAGES): @@ -402,11 +405,11 @@ def extract_pages(src_pdf_bytes: bytes) -> bytes: Extract up to 10 random pages and return them as a new PDF. """ - with _pdf_sample_extract_lock: - pdf = None - sample_docs = None - try: - pdf = pdfium.PdfDocument(src_pdf_bytes) + pdf = None + sample_docs = None + try: + with pdfium_guard(): + pdf = open_pdfium_document(pdfium.PdfDocument, src_pdf_bytes) total_page = len(pdf) if total_page == 0: logger.warning("PDF is empty, return empty document") @@ -420,20 +423,18 @@ def extract_pages(src_pdf_bytes: bytes) -> bytes: total_page, select_page_cnt, replace=False ).tolist() - sample_docs = pdfium.PdfDocument.new() + sample_docs = open_pdfium_document(pdfium.PdfDocument.new) sample_docs.import_pages(pdf, page_indices) output_buffer = BytesIO() sample_docs.save(output_buffer) return output_buffer.getvalue() - except Exception as e: - logger.exception(e) - return src_pdf_bytes - finally: - if pdf is not None: - pdf.close() - if sample_docs is not None: - sample_docs.close() + except Exception as e: + logger.exception(e) + return src_pdf_bytes + finally: + close_pdfium_document(pdf) + close_pdfium_document(sample_docs) def extract_selected_pages(src_pdf_bytes: bytes, page_indices) -> bytes: @@ -445,11 +446,11 @@ def extract_selected_pages(src_pdf_bytes: bytes, page_indices) -> bytes: if not selected_page_indices: return b"" - with _pdf_sample_extract_lock: - pdf = None - sample_docs = None - try: - pdf = pdfium.PdfDocument(src_pdf_bytes) + pdf = None + sample_docs = None + try: + with pdfium_guard(): + pdf = open_pdfium_document(pdfium.PdfDocument, src_pdf_bytes) total_page = len(pdf) if total_page == 0: logger.warning("PDF is empty, return empty document") @@ -466,20 +467,18 @@ def extract_selected_pages(src_pdf_bytes: bytes, page_indices) -> bytes: if selected_page_indices == list(range(total_page)): return src_pdf_bytes - sample_docs = pdfium.PdfDocument.new() + sample_docs = open_pdfium_document(pdfium.PdfDocument.new) sample_docs.import_pages(pdf, selected_page_indices) output_buffer = BytesIO() sample_docs.save(output_buffer) return output_buffer.getvalue() - except Exception as e: - logger.exception(e) - return src_pdf_bytes - finally: - if pdf is not None: - pdf.close() - if sample_docs is not None: - sample_docs.close() + except Exception as e: + logger.exception(e) + return src_pdf_bytes + finally: + close_pdfium_document(pdf) + close_pdfium_document(sample_docs) def detect_invalid_chars(sample_pdf_bytes: bytes) -> bool: diff --git a/mineru/utils/pdf_image_tools.py b/mineru/utils/pdf_image_tools.py index 4b702fee..1627c387 100644 --- a/mineru/utils/pdf_image_tools.py +++ b/mineru/utils/pdf_image_tools.py @@ -17,6 +17,12 @@ from mineru.utils.pdf_reader import image_to_b64str, image_to_bytes, page_to_ima from mineru.utils.enum_class import ImageType from mineru.utils.hash_utils import str_sha256 from mineru.utils.pdf_page_id import get_end_page_id +from mineru.utils.pdfium_guard import ( + close_pdfium_document, + get_pdfium_document_page_count, + open_pdfium_document, + pdfium_guard, +) from concurrent.futures import ProcessPoolExecutor, wait, ALL_COMPLETED @@ -130,23 +136,34 @@ def load_images_from_pdf( Raises: TimeoutError: 当转换超时时抛出 """ - pdf_doc = pdfium.PdfDocument(pdf_bytes) + pdf_doc = open_pdfium_document(pdfium.PdfDocument, pdf_bytes) if is_windows_environment(): # Windows 环境下不使用多进程 - return load_images_from_pdf_core( - pdf_bytes, - dpi, - start_page_id, - get_end_page_id(end_page_id, len(pdf_doc)), - image_type, - ), pdf_doc + try: + images_list = load_images_from_pdf_core( + pdf_bytes, + dpi, + start_page_id, + get_end_page_id( + end_page_id, + get_pdfium_document_page_count(pdf_doc), + ), + image_type, + ) + return images_list, pdf_doc + except Exception: + close_pdfium_document(pdf_doc) + raise else: if timeout is None: timeout = get_load_images_timeout() if threads is None: threads = get_load_images_threads() - end_page_id = get_end_page_id(end_page_id, len(pdf_doc)) + end_page_id = get_end_page_id( + end_page_id, + get_pdfium_document_page_count(pdf_doc), + ) actual_threads, page_ranges = _get_render_process_plan( start_page_id, end_page_id, @@ -179,7 +196,7 @@ def load_images_from_pdf( if not_done: # 超时:强制终止所有子进程 _terminate_executor_processes(executor) - pdf_doc.close() + close_pdfium_document(pdf_doc) raise TimeoutError(f"PDF to images conversion timeout after {timeout}s") # 所有任务完成,收集结果 @@ -201,7 +218,7 @@ def load_images_from_pdf( except Exception as e: # 发生任何异常时,确保清理子进程 _terminate_executor_processes(executor) - pdf_doc.close() + close_pdfium_document(pdf_doc) if isinstance(e, TimeoutError): raise raise @@ -240,17 +257,20 @@ def load_images_from_pdf_core( image_type=ImageType.PIL, # PIL or BASE64 ): images_list = [] - pdf_doc = pdfium.PdfDocument(pdf_bytes) - pdf_page_num = len(pdf_doc) - end_page_id = get_end_page_id(end_page_id, pdf_page_num) + pdf_doc = None + try: + with pdfium_guard(): + pdf_doc = open_pdfium_document(pdfium.PdfDocument, pdf_bytes) + pdf_page_num = len(pdf_doc) + end_page_id = get_end_page_id(end_page_id, pdf_page_num) - for index in range(start_page_id, end_page_id + 1): - # logger.debug(f"Converting page {index}/{pdf_page_num} to image") - page = pdf_doc[index] - image_dict = pdf_page_to_image(page, dpi=dpi, image_type=image_type) - images_list.append(image_dict) - - pdf_doc.close() + for index in range(start_page_id, end_page_id + 1): + # logger.debug(f"Converting page {index}/{pdf_page_num} to image") + page = pdf_doc[index] + image_dict = pdf_page_to_image(page, dpi=dpi, image_type=image_type) + images_list.append(image_dict) + finally: + close_pdfium_document(pdf_doc) return images_list @@ -263,13 +283,14 @@ def load_images_from_pdf_doc( image_type=ImageType.PIL, ): images_list = [] - pdf_page_num = len(pdf_doc) - end_page_id = get_end_page_id(end_page_id, pdf_page_num) + with pdfium_guard(): + pdf_page_num = len(pdf_doc) + end_page_id = get_end_page_id(end_page_id, pdf_page_num) - for index in range(start_page_id, end_page_id + 1): - page = pdf_doc[index] - image_dict = pdf_page_to_image(page, dpi=dpi, image_type=image_type) - images_list.append(image_dict) + for index in range(start_page_id, end_page_id + 1): + page = pdf_doc[index] + image_dict = pdf_page_to_image(page, dpi=dpi, image_type=image_type) + images_list.append(image_dict) return images_list @@ -354,4 +375,4 @@ def images_bytes_to_pdf_bytes(image_bytes): # 获取 PDF bytes 并重置指针(可选) pdf_bytes = pdf_buffer.getvalue() pdf_buffer.close() - return pdf_bytes \ No newline at end of file + return pdf_bytes diff --git a/mineru/utils/pdf_reader.py b/mineru/utils/pdf_reader.py index 5da907eb..27107f16 100644 --- a/mineru/utils/pdf_reader.py +++ b/mineru/utils/pdf_reader.py @@ -5,6 +5,11 @@ from io import BytesIO from loguru import logger from PIL import Image from pypdfium2 import PdfBitmap, PdfDocument, PdfPage +from mineru.utils.pdfium_guard import ( + close_pdfium_document, + open_pdfium_document, + pdfium_guard, +) def page_to_image( @@ -12,19 +17,20 @@ def page_to_image( dpi: int = 200, max_width_or_height: int = 3500, # changed from 4500 to 3500 ) -> (Image.Image, float): - scale = dpi / 72 + with pdfium_guard(): + scale = dpi / 72 - long_side_length = max(*page.get_size()) - if (long_side_length*scale) > max_width_or_height: - scale = max_width_or_height / long_side_length + long_side_length = max(*page.get_size()) + if (long_side_length*scale) > max_width_or_height: + scale = max_width_or_height / long_side_length - bitmap: PdfBitmap = page.render(scale=scale) # type: ignore + bitmap: PdfBitmap = page.render(scale=scale) # type: ignore - image = bitmap.to_pil() - try: - bitmap.close() - except Exception as e: - logger.error(f"Failed to close bitmap: {e}") + image = bitmap.to_pil() + try: + bitmap.close() + except Exception as e: + logger.error(f"Failed to close bitmap: {e}") return image, scale @@ -65,25 +71,23 @@ def pdf_to_images( start_page_id: int = 0, end_page_id: int | None = None, ) -> list[Image.Image]: - doc = pdf if isinstance(pdf, PdfDocument) else PdfDocument(pdf) - page_num = len(doc) - - end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else page_num - 1 - if end_page_id > page_num - 1: - logger.warning("end_page_id is out of range, use images length") - end_page_id = page_num - 1 - - images = [] + doc = pdf if isinstance(pdf, PdfDocument) else open_pdfium_document(PdfDocument, pdf) try: - for i in range(start_page_id, end_page_id + 1): - image, _ = page_to_image(doc[i], dpi, max_width_or_height) - images.append(image) + with pdfium_guard(): + page_num = len(doc) + + end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else page_num - 1 + if end_page_id > page_num - 1: + logger.warning("end_page_id is out of range, use images length") + end_page_id = page_num - 1 + + images = [] + for i in range(start_page_id, end_page_id + 1): + image, _ = page_to_image(doc[i], dpi, max_width_or_height) + images.append(image) + return images finally: - try: - doc.close() - except Exception: - pass - return images + close_pdfium_document(doc) def pdf_to_images_bytes( diff --git a/mineru/utils/pdf_text_tool.py b/mineru/utils/pdf_text_tool.py index 86357439..ab4cee9a 100644 --- a/mineru/utils/pdf_text_tool.py +++ b/mineru/utils/pdf_text_tool.py @@ -1,18 +1,20 @@ -from typing import List import math +from typing import List import pypdfium2 as pdfium -from pdftext.pdf.chars import get_chars, deduplicate_chars -from pdftext.pdf.pages import get_spans, get_lines, assign_scripts, get_blocks +from pdftext.pdf.chars import deduplicate_chars, get_chars +from pdftext.pdf.pages import assign_scripts, get_blocks, get_lines, get_spans + +from mineru.utils.pdfium_guard import pdfium_guard def get_page( page: pdfium.PdfPage, - quote_loosebox: bool =True, + quote_loosebox: bool = True, superscript_height_threshold: float = 0.7, line_distance_threshold: float = 0.1, ) -> dict: - + with pdfium_guard(): textpage = page.get_textpage() page_bbox: List[float] = page.get_bbox() page_width = math.ceil(abs(page_bbox[2] - page_bbox[0])) @@ -21,20 +23,29 @@ def get_page( page_rotation = 0 try: page_rotation = page.get_rotation() - except: + except Exception: pass - chars = deduplicate_chars(get_chars(textpage, page_bbox, page_rotation, quote_loosebox)) - spans = get_spans(chars, superscript_height_threshold=superscript_height_threshold, line_distance_threshold=line_distance_threshold) + chars = deduplicate_chars( + get_chars(textpage, page_bbox, page_rotation, quote_loosebox) + ) + spans = get_spans( + chars, + superscript_height_threshold=superscript_height_threshold, + line_distance_threshold=line_distance_threshold, + ) lines = get_lines(spans) - assign_scripts(lines, height_threshold=superscript_height_threshold, line_distance_threshold=line_distance_threshold) + assign_scripts( + lines, + height_threshold=superscript_height_threshold, + line_distance_threshold=line_distance_threshold, + ) blocks = get_blocks(lines) - page = { + return { "bbox": page_bbox, "width": page_width, "height": page_height, "rotation": page_rotation, - "blocks": blocks + "blocks": blocks, } - return page \ No newline at end of file