mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
feat: refactor PDF handling to utilize pdfium_guard for resource management
This commit is contained in:
@@ -32,6 +32,11 @@ from mineru.utils.ocr_utils import get_adjusted_mfdetrec_res, get_ocr_result_lis
|
||||
update_det_boxes, OcrConfidence
|
||||
from mineru.utils.pdf_classify import classify
|
||||
from mineru.utils.pdf_image_tools import load_images_from_pdf, load_images_from_pdf_doc
|
||||
from mineru.utils.pdfium_guard import (
|
||||
close_pdfium_document,
|
||||
get_pdfium_document_page_count,
|
||||
open_pdfium_document,
|
||||
)
|
||||
|
||||
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 让mps可以fallback
|
||||
os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
|
||||
@@ -618,15 +623,15 @@ def doc_analyze_low_memory(
|
||||
_ocr_enable = ocr_classify(pdf_bytes, parse_method=parse_method)
|
||||
_vlm_ocr_enable = _should_enable_vlm_ocr(_ocr_enable, language, inline_formula_enable)
|
||||
|
||||
pdf_doc = pdfium.PdfDocument(pdf_bytes)
|
||||
pdf_doc = open_pdfium_document(pdfium.PdfDocument, pdf_bytes)
|
||||
middle_json = init_middle_json(_ocr_enable, _vlm_ocr_enable)
|
||||
model_list = []
|
||||
doc_closed = False
|
||||
hybrid_pipeline_model = None
|
||||
try:
|
||||
page_count = len(pdf_doc)
|
||||
page_count = get_pdfium_document_page_count(pdf_doc)
|
||||
if page_count == 0:
|
||||
pdf_doc.close()
|
||||
close_pdfium_document(pdf_doc)
|
||||
doc_closed = True
|
||||
clean_memory(device)
|
||||
return middle_json, model_list, _vlm_ocr_enable
|
||||
@@ -702,13 +707,13 @@ def doc_analyze_low_memory(
|
||||
_ocr_enable,
|
||||
_vlm_ocr_enable,
|
||||
)
|
||||
pdf_doc.close()
|
||||
close_pdfium_document(pdf_doc)
|
||||
doc_closed = True
|
||||
clean_memory(device)
|
||||
return middle_json, model_list, _vlm_ocr_enable
|
||||
finally:
|
||||
if not doc_closed:
|
||||
pdf_doc.close()
|
||||
close_pdfium_document(pdf_doc)
|
||||
|
||||
|
||||
async def aio_doc_analyze(
|
||||
@@ -801,15 +806,15 @@ async def aio_doc_analyze_low_memory(
|
||||
_ocr_enable = ocr_classify(pdf_bytes, parse_method=parse_method)
|
||||
_vlm_ocr_enable = _should_enable_vlm_ocr(_ocr_enable, language, inline_formula_enable)
|
||||
|
||||
pdf_doc = pdfium.PdfDocument(pdf_bytes)
|
||||
pdf_doc = open_pdfium_document(pdfium.PdfDocument, pdf_bytes)
|
||||
middle_json = init_middle_json(_ocr_enable, _vlm_ocr_enable)
|
||||
model_list = []
|
||||
doc_closed = False
|
||||
hybrid_pipeline_model = None
|
||||
try:
|
||||
page_count = len(pdf_doc)
|
||||
page_count = get_pdfium_document_page_count(pdf_doc)
|
||||
if page_count == 0:
|
||||
pdf_doc.close()
|
||||
close_pdfium_document(pdf_doc)
|
||||
doc_closed = True
|
||||
clean_memory(device)
|
||||
return middle_json, model_list, _vlm_ocr_enable
|
||||
@@ -885,10 +890,10 @@ async def aio_doc_analyze_low_memory(
|
||||
_ocr_enable,
|
||||
_vlm_ocr_enable,
|
||||
)
|
||||
pdf_doc.close()
|
||||
close_pdfium_document(pdf_doc)
|
||||
doc_closed = True
|
||||
clean_memory(device)
|
||||
return middle_json, model_list, _vlm_ocr_enable
|
||||
finally:
|
||||
if not doc_closed:
|
||||
pdf_doc.close()
|
||||
close_pdfium_document(pdf_doc)
|
||||
|
||||
@@ -16,6 +16,7 @@ from mineru.utils.enum_class import ContentType
|
||||
from mineru.utils.hash_utils import bytes_md5
|
||||
from mineru.utils.ocr_utils import OcrConfidence, rotate_vertical_crop_if_needed
|
||||
from mineru.utils.pdf_image_tools import get_crop_img
|
||||
from mineru.utils.pdfium_guard import close_pdfium_document, pdfium_guard
|
||||
from mineru.version import __version__
|
||||
|
||||
|
||||
@@ -47,7 +48,8 @@ def blocks_to_page_info(
|
||||
scale = image_dict["scale"]
|
||||
page_pil_img = image_dict["img_pil"]
|
||||
page_img_md5 = bytes_md5(page_pil_img.tobytes())
|
||||
width, height = map(int, page.get_size())
|
||||
with pdfium_guard():
|
||||
width, height = map(int, page.get_size())
|
||||
|
||||
magic_model = MagicModel(
|
||||
page_model_list,
|
||||
@@ -189,7 +191,8 @@ def append_page_results_to_middle_json(
|
||||
zip(model_list, images_list)
|
||||
):
|
||||
page_index = page_start_index + offset
|
||||
page = pdf_doc[page_index]
|
||||
with pdfium_guard():
|
||||
page = pdf_doc[page_index]
|
||||
page_info = blocks_to_page_info(
|
||||
page_model_list,
|
||||
image_dict,
|
||||
@@ -271,5 +274,5 @@ def result_to_middle_json(
|
||||
_ocr_enable,
|
||||
_vlm_ocr_enable,
|
||||
)
|
||||
pdf_doc.close()
|
||||
close_pdfium_document(pdf_doc)
|
||||
return middle_json
|
||||
|
||||
@@ -21,6 +21,7 @@ from mineru.backend.pipeline.pipeline_magic_model import MagicModel
|
||||
from mineru.utils.ocr_utils import OcrConfidence, rotate_vertical_crop_if_needed
|
||||
from mineru.version import __version__
|
||||
from mineru.utils.hash_utils import bytes_md5, str_sha256
|
||||
from mineru.utils.pdfium_guard import close_pdfium_document, pdfium_guard
|
||||
|
||||
|
||||
def _save_base64_image(b64_data_uri: str, image_writer, page_index: int):
|
||||
@@ -89,7 +90,8 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
|
||||
scale = image_dict["scale"]
|
||||
page_pil_img = image_dict["img_pil"]
|
||||
page_img_md5 = bytes_md5(page_pil_img.tobytes())
|
||||
page_w, page_h = map(int, page.get_size())
|
||||
with pdfium_guard():
|
||||
page_w, page_h = map(int, page.get_size())
|
||||
magic_model = MagicModel(
|
||||
page_model_info,
|
||||
page,
|
||||
@@ -141,16 +143,19 @@ def append_page_model_infos_to_middle_json(
|
||||
):
|
||||
for offset, (page_model_info, image_dict) in enumerate(zip(page_model_infos, images_list)):
|
||||
page_index = page_start_index + offset
|
||||
with pdfium_guard():
|
||||
page = pdf_doc[page_index]
|
||||
page_info = page_model_info_to_page_info(
|
||||
copy.deepcopy(page_model_info),
|
||||
image_dict,
|
||||
pdf_doc[page_index],
|
||||
page,
|
||||
image_writer,
|
||||
page_index,
|
||||
ocr_enable=ocr_enable,
|
||||
)
|
||||
if page_info is None:
|
||||
page_w, page_h = map(int, pdf_doc[page_index].get_size())
|
||||
with pdfium_guard():
|
||||
page_w, page_h = map(int, pdf_doc[page_index].get_size())
|
||||
page_info = make_page_info_dict([], page_index, page_w, page_h, [])
|
||||
middle_json["pdf_info"].append(page_info)
|
||||
if progress_bar is not None:
|
||||
@@ -354,7 +359,7 @@ def result_to_middle_json(model_list, images_list, pdf_doc, image_writer, lang=N
|
||||
)
|
||||
|
||||
finalize_middle_json(middle_json["pdf_info"], lang=lang, ocr_enable=ocr_enable)
|
||||
pdf_doc.close()
|
||||
close_pdfium_document(pdf_doc)
|
||||
return middle_json
|
||||
|
||||
|
||||
|
||||
@@ -15,6 +15,11 @@ from ...utils.enum_class import ImageType
|
||||
from ...utils.pdf_classify import classify
|
||||
from ...utils.pdf_image_tools import load_images_from_pdf, load_images_from_pdf_doc
|
||||
from ...utils.model_utils import get_vram, clean_memory
|
||||
from ...utils.pdfium_guard import (
|
||||
close_pdfium_document,
|
||||
get_pdfium_document_page_count,
|
||||
open_pdfium_document,
|
||||
)
|
||||
|
||||
|
||||
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 让mps可以fallback
|
||||
@@ -113,7 +118,7 @@ def _close_doc_context(context):
|
||||
if context['closed']:
|
||||
return
|
||||
try:
|
||||
context['pdf_doc'].close()
|
||||
close_pdfium_document(context['pdf_doc'])
|
||||
except Exception:
|
||||
pass
|
||||
_close_images(context['images_list'])
|
||||
@@ -138,7 +143,7 @@ def _finalize_low_memory_context(context, on_doc_ready):
|
||||
context['middle_json'],
|
||||
context['ocr_enable'],
|
||||
)
|
||||
context['pdf_doc'].close()
|
||||
close_pdfium_document(context['pdf_doc'])
|
||||
context['closed'] = True
|
||||
|
||||
|
||||
@@ -180,7 +185,7 @@ def doc_analyze(
|
||||
for pdf_doc in all_pdf_docs:
|
||||
if pdf_doc is not None:
|
||||
try:
|
||||
pdf_doc.close()
|
||||
close_pdfium_document(pdf_doc)
|
||||
except Exception:
|
||||
pass
|
||||
for images_list in all_image_lists:
|
||||
@@ -377,8 +382,8 @@ def doc_analyze_low_memory_multi_streaming(
|
||||
total_pages = 0
|
||||
for doc_index, (pdf_bytes, image_writer, lang) in enumerate(zip(pdf_bytes_list, image_writer_list, lang_list)):
|
||||
_ocr_enable = _get_ocr_enable(pdf_bytes, parse_method)
|
||||
pdf_doc = pdfium.PdfDocument(pdf_bytes)
|
||||
page_count = len(pdf_doc)
|
||||
pdf_doc = open_pdfium_document(pdfium.PdfDocument, pdf_bytes)
|
||||
page_count = get_pdfium_document_page_count(pdf_doc)
|
||||
total_pages += page_count
|
||||
doc_contexts.append(
|
||||
{
|
||||
@@ -494,7 +499,7 @@ def doc_analyze_low_memory_multi_streaming(
|
||||
finally:
|
||||
for context in doc_contexts:
|
||||
if not context['closed']:
|
||||
context['pdf_doc'].close()
|
||||
close_pdfium_document(context['pdf_doc'])
|
||||
context['closed'] = True
|
||||
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@ from mineru.utils.cut_image import cut_image_and_table
|
||||
from mineru.utils.enum_class import ContentType
|
||||
from mineru.utils.hash_utils import bytes_md5
|
||||
from mineru.utils.pdf_image_tools import get_crop_img
|
||||
from mineru.utils.pdfium_guard import close_pdfium_document, pdfium_guard
|
||||
from mineru.version import __version__
|
||||
|
||||
|
||||
@@ -37,7 +38,8 @@ def blocks_to_page_info(page_blocks, image_dict, page, image_writer, page_index)
|
||||
# page_pil_img = image_dict["img_pil"]
|
||||
page_pil_img = image_dict["img_pil"]
|
||||
page_img_md5 = bytes_md5(page_pil_img.tobytes())
|
||||
width, height = map(int, page.get_size())
|
||||
with pdfium_guard():
|
||||
width, height = map(int, page.get_size())
|
||||
|
||||
magic_model = MagicModel(page_blocks, width, height)
|
||||
image_blocks = magic_model.get_image_blocks()
|
||||
@@ -115,7 +117,8 @@ def append_page_blocks_to_middle_json(
|
||||
):
|
||||
for offset, (page_blocks, image_dict) in enumerate(zip(model_output_blocks_list, images_list)):
|
||||
page_index = page_start_index + offset
|
||||
page = pdf_doc[page_index]
|
||||
with pdfium_guard():
|
||||
page = pdf_doc[page_index]
|
||||
page_info = blocks_to_page_info(page_blocks, image_dict, page, image_writer, page_index)
|
||||
middle_json["pdf_info"].append(page_info)
|
||||
if progress_bar is not None:
|
||||
@@ -146,5 +149,5 @@ def result_to_middle_json(model_output_blocks_list, images_list, pdf_doc, image_
|
||||
)
|
||||
|
||||
finalize_middle_json(middle_json["pdf_info"])
|
||||
pdf_doc.close()
|
||||
close_pdfium_document(pdf_doc)
|
||||
return middle_json
|
||||
|
||||
@@ -19,6 +19,11 @@ from ...utils.check_sys_env import is_mac_os_version_supported
|
||||
from ...utils.config_reader import get_device, get_low_memory_window_size
|
||||
|
||||
from ...utils.enum_class import ImageType
|
||||
from ...utils.pdfium_guard import (
|
||||
close_pdfium_document,
|
||||
get_pdfium_document_page_count,
|
||||
open_pdfium_document,
|
||||
)
|
||||
from ...utils.models_download_utils import auto_download_and_get_model_root_path
|
||||
|
||||
from mineru_vl_utils import MinerUClient
|
||||
@@ -322,14 +327,14 @@ def doc_analyze_low_memory(
|
||||
predictor = ModelSingleton().get_model(backend, model_path, server_url, **kwargs)
|
||||
predictor = _maybe_enable_serial_execution(predictor, backend)
|
||||
|
||||
pdf_doc = pdfium.PdfDocument(pdf_bytes)
|
||||
pdf_doc = open_pdfium_document(pdfium.PdfDocument, pdf_bytes)
|
||||
middle_json = init_middle_json()
|
||||
results = []
|
||||
doc_closed = False
|
||||
try:
|
||||
page_count = len(pdf_doc)
|
||||
page_count = get_pdfium_document_page_count(pdf_doc)
|
||||
if page_count == 0:
|
||||
pdf_doc.close()
|
||||
close_pdfium_document(pdf_doc)
|
||||
doc_closed = True
|
||||
return middle_json, results
|
||||
window_size = min(page_count, get_low_memory_window_size(default=64))
|
||||
@@ -377,12 +382,12 @@ def doc_analyze_low_memory(
|
||||
f"speed: {round(len(results) / infer_time, 3)} page/s"
|
||||
)
|
||||
finalize_middle_json(middle_json["pdf_info"])
|
||||
pdf_doc.close()
|
||||
close_pdfium_document(pdf_doc)
|
||||
doc_closed = True
|
||||
return middle_json, results
|
||||
finally:
|
||||
if not doc_closed:
|
||||
pdf_doc.close()
|
||||
close_pdfium_document(pdf_doc)
|
||||
|
||||
|
||||
async def aio_doc_analyze(
|
||||
@@ -426,14 +431,14 @@ async def aio_doc_analyze_low_memory(
|
||||
predictor = ModelSingleton().get_model(backend, model_path, server_url, **kwargs)
|
||||
predictor = _maybe_enable_serial_execution(predictor, backend)
|
||||
|
||||
pdf_doc = pdfium.PdfDocument(pdf_bytes)
|
||||
pdf_doc = open_pdfium_document(pdfium.PdfDocument, pdf_bytes)
|
||||
middle_json = init_middle_json()
|
||||
results = []
|
||||
doc_closed = False
|
||||
try:
|
||||
page_count = len(pdf_doc)
|
||||
page_count = get_pdfium_document_page_count(pdf_doc)
|
||||
if page_count == 0:
|
||||
pdf_doc.close()
|
||||
close_pdfium_document(pdf_doc)
|
||||
doc_closed = True
|
||||
return middle_json, results
|
||||
window_size = min(page_count, get_low_memory_window_size(default=64))
|
||||
@@ -481,9 +486,9 @@ async def aio_doc_analyze_low_memory(
|
||||
f"speed: {round(len(results) / infer_time, 3)} page/s"
|
||||
)
|
||||
finalize_middle_json(middle_json["pdf_info"])
|
||||
pdf_doc.close()
|
||||
close_pdfium_document(pdf_doc)
|
||||
doc_closed = True
|
||||
return middle_json, results
|
||||
finally:
|
||||
if not doc_closed:
|
||||
pdf_doc.close()
|
||||
close_pdfium_document(pdf_doc)
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
# Copyright (c) Opendatalab. All rights reserved.
|
||||
import os
|
||||
import re
|
||||
import threading
|
||||
from io import BytesIO
|
||||
|
||||
import numpy as np
|
||||
@@ -16,6 +15,11 @@ from pdfminer.pdfdocument import PDFDocument
|
||||
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
|
||||
from pdfminer.pdfpage import PDFPage
|
||||
from pdfminer.pdfparser import PDFParser
|
||||
from mineru.utils.pdfium_guard import (
|
||||
close_pdfium_document,
|
||||
open_pdfium_document,
|
||||
pdfium_guard,
|
||||
)
|
||||
|
||||
PDF_CLASSIFY_STRATEGY_ENV = "MINERU_PDF_CLASSIFY_STRATEGY"
|
||||
PDF_CLASSIFY_STRATEGY_HYBRID = "hybrid"
|
||||
@@ -33,9 +37,6 @@ _ALLOWED_CONTROL_CODES = {9, 10, 13}
|
||||
_PRIVATE_USE_AREA_START = 0xE000
|
||||
_PRIVATE_USE_AREA_END = 0xF8FF
|
||||
|
||||
_pdf_sample_extract_lock = threading.Lock()
|
||||
|
||||
|
||||
def classify(pdf_bytes):
|
||||
"""
|
||||
Classify a PDF as text-based or OCR-based.
|
||||
@@ -79,48 +80,48 @@ def classify_hybrid(pdf_bytes):
|
||||
should_run_pdfminer_fallback = False
|
||||
|
||||
try:
|
||||
pdf = pdfium.PdfDocument(pdf_bytes)
|
||||
page_count = len(pdf)
|
||||
if page_count == 0:
|
||||
return "ocr"
|
||||
|
||||
page_indices = get_sample_page_indices(page_count, MAX_SAMPLE_PAGES)
|
||||
if not page_indices:
|
||||
return "ocr"
|
||||
|
||||
if (
|
||||
get_avg_cleaned_chars_per_page_pdfium(pdf, page_indices)
|
||||
< CHARS_THRESHOLD
|
||||
):
|
||||
return "ocr"
|
||||
|
||||
if detect_cid_font_signal_pypdf(pdf_bytes, page_indices):
|
||||
return "ocr"
|
||||
|
||||
text_quality_signal = get_text_quality_signal_pdfium(pdf, page_indices)
|
||||
total_chars = text_quality_signal["total_chars"]
|
||||
abnormal_ratio = text_quality_signal["abnormal_ratio"]
|
||||
|
||||
if total_chars >= TEXT_QUALITY_MIN_CHARS:
|
||||
if abnormal_ratio >= TEXT_QUALITY_BAD_THRESHOLD:
|
||||
with pdfium_guard():
|
||||
pdf = open_pdfium_document(pdfium.PdfDocument, pdf_bytes)
|
||||
page_count = len(pdf)
|
||||
if page_count == 0:
|
||||
return "ocr"
|
||||
should_run_pdfminer_fallback = abnormal_ratio > TEXT_QUALITY_GOOD_THRESHOLD
|
||||
else:
|
||||
should_run_pdfminer_fallback = True
|
||||
|
||||
if (
|
||||
get_high_image_coverage_ratio_pdfium(pdf, page_indices)
|
||||
>= HIGH_IMAGE_COVERAGE_THRESHOLD
|
||||
):
|
||||
return "ocr"
|
||||
page_indices = get_sample_page_indices(page_count, MAX_SAMPLE_PAGES)
|
||||
if not page_indices:
|
||||
return "ocr"
|
||||
|
||||
if (
|
||||
get_avg_cleaned_chars_per_page_pdfium(pdf, page_indices)
|
||||
< CHARS_THRESHOLD
|
||||
):
|
||||
return "ocr"
|
||||
|
||||
if detect_cid_font_signal_pypdf(pdf_bytes, page_indices):
|
||||
return "ocr"
|
||||
|
||||
text_quality_signal = get_text_quality_signal_pdfium(pdf, page_indices)
|
||||
total_chars = text_quality_signal["total_chars"]
|
||||
abnormal_ratio = text_quality_signal["abnormal_ratio"]
|
||||
|
||||
if total_chars >= TEXT_QUALITY_MIN_CHARS:
|
||||
if abnormal_ratio >= TEXT_QUALITY_BAD_THRESHOLD:
|
||||
return "ocr"
|
||||
should_run_pdfminer_fallback = abnormal_ratio > TEXT_QUALITY_GOOD_THRESHOLD
|
||||
else:
|
||||
should_run_pdfminer_fallback = True
|
||||
|
||||
if (
|
||||
get_high_image_coverage_ratio_pdfium(pdf, page_indices)
|
||||
>= HIGH_IMAGE_COVERAGE_THRESHOLD
|
||||
):
|
||||
return "ocr"
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to classify PDF with hybrid strategy: {e}")
|
||||
return "ocr"
|
||||
|
||||
finally:
|
||||
if pdf is not None:
|
||||
pdf.close()
|
||||
close_pdfium_document(pdf)
|
||||
|
||||
if should_run_pdfminer_fallback:
|
||||
sample_pdf_bytes = extract_selected_pages(pdf_bytes, page_indices)
|
||||
@@ -140,33 +141,35 @@ def classify_legacy(pdf_bytes):
|
||||
sample_pdf_bytes = extract_pages(pdf_bytes)
|
||||
if not sample_pdf_bytes:
|
||||
return "ocr"
|
||||
pdf = pdfium.PdfDocument(sample_pdf_bytes)
|
||||
pdf = None
|
||||
try:
|
||||
page_count = len(pdf)
|
||||
if page_count == 0:
|
||||
return "ocr"
|
||||
with pdfium_guard():
|
||||
pdf = open_pdfium_document(pdfium.PdfDocument, sample_pdf_bytes)
|
||||
page_count = len(pdf)
|
||||
if page_count == 0:
|
||||
return "ocr"
|
||||
|
||||
pages_to_check = min(page_count, MAX_SAMPLE_PAGES)
|
||||
pages_to_check = min(page_count, MAX_SAMPLE_PAGES)
|
||||
|
||||
if (
|
||||
get_avg_cleaned_chars_per_page(pdf, pages_to_check) < CHARS_THRESHOLD
|
||||
) or detect_invalid_chars(sample_pdf_bytes):
|
||||
return "ocr"
|
||||
if (
|
||||
get_avg_cleaned_chars_per_page(pdf, pages_to_check) < CHARS_THRESHOLD
|
||||
) or detect_invalid_chars(sample_pdf_bytes):
|
||||
return "ocr"
|
||||
|
||||
if (
|
||||
get_high_image_coverage_ratio(sample_pdf_bytes, pages_to_check)
|
||||
>= HIGH_IMAGE_COVERAGE_THRESHOLD
|
||||
):
|
||||
return "ocr"
|
||||
if (
|
||||
get_high_image_coverage_ratio(sample_pdf_bytes, pages_to_check)
|
||||
>= HIGH_IMAGE_COVERAGE_THRESHOLD
|
||||
):
|
||||
return "ocr"
|
||||
|
||||
return "txt"
|
||||
return "txt"
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to classify PDF with legacy strategy: {e}")
|
||||
logger.warning(f"Failed to classify PDF with legacy strategy: {e}")
|
||||
return "ocr"
|
||||
|
||||
finally:
|
||||
pdf.close()
|
||||
close_pdfium_document(pdf)
|
||||
|
||||
|
||||
def get_sample_page_indices(page_count: int, max_pages: int = MAX_SAMPLE_PAGES):
|
||||
@@ -402,11 +405,11 @@ def extract_pages(src_pdf_bytes: bytes) -> bytes:
|
||||
Extract up to 10 random pages and return them as a new PDF.
|
||||
"""
|
||||
|
||||
with _pdf_sample_extract_lock:
|
||||
pdf = None
|
||||
sample_docs = None
|
||||
try:
|
||||
pdf = pdfium.PdfDocument(src_pdf_bytes)
|
||||
pdf = None
|
||||
sample_docs = None
|
||||
try:
|
||||
with pdfium_guard():
|
||||
pdf = open_pdfium_document(pdfium.PdfDocument, src_pdf_bytes)
|
||||
total_page = len(pdf)
|
||||
if total_page == 0:
|
||||
logger.warning("PDF is empty, return empty document")
|
||||
@@ -420,20 +423,18 @@ def extract_pages(src_pdf_bytes: bytes) -> bytes:
|
||||
total_page, select_page_cnt, replace=False
|
||||
).tolist()
|
||||
|
||||
sample_docs = pdfium.PdfDocument.new()
|
||||
sample_docs = open_pdfium_document(pdfium.PdfDocument.new)
|
||||
sample_docs.import_pages(pdf, page_indices)
|
||||
|
||||
output_buffer = BytesIO()
|
||||
sample_docs.save(output_buffer)
|
||||
return output_buffer.getvalue()
|
||||
except Exception as e:
|
||||
logger.exception(e)
|
||||
return src_pdf_bytes
|
||||
finally:
|
||||
if pdf is not None:
|
||||
pdf.close()
|
||||
if sample_docs is not None:
|
||||
sample_docs.close()
|
||||
except Exception as e:
|
||||
logger.exception(e)
|
||||
return src_pdf_bytes
|
||||
finally:
|
||||
close_pdfium_document(pdf)
|
||||
close_pdfium_document(sample_docs)
|
||||
|
||||
|
||||
def extract_selected_pages(src_pdf_bytes: bytes, page_indices) -> bytes:
|
||||
@@ -445,11 +446,11 @@ def extract_selected_pages(src_pdf_bytes: bytes, page_indices) -> bytes:
|
||||
if not selected_page_indices:
|
||||
return b""
|
||||
|
||||
with _pdf_sample_extract_lock:
|
||||
pdf = None
|
||||
sample_docs = None
|
||||
try:
|
||||
pdf = pdfium.PdfDocument(src_pdf_bytes)
|
||||
pdf = None
|
||||
sample_docs = None
|
||||
try:
|
||||
with pdfium_guard():
|
||||
pdf = open_pdfium_document(pdfium.PdfDocument, src_pdf_bytes)
|
||||
total_page = len(pdf)
|
||||
if total_page == 0:
|
||||
logger.warning("PDF is empty, return empty document")
|
||||
@@ -466,20 +467,18 @@ def extract_selected_pages(src_pdf_bytes: bytes, page_indices) -> bytes:
|
||||
if selected_page_indices == list(range(total_page)):
|
||||
return src_pdf_bytes
|
||||
|
||||
sample_docs = pdfium.PdfDocument.new()
|
||||
sample_docs = open_pdfium_document(pdfium.PdfDocument.new)
|
||||
sample_docs.import_pages(pdf, selected_page_indices)
|
||||
|
||||
output_buffer = BytesIO()
|
||||
sample_docs.save(output_buffer)
|
||||
return output_buffer.getvalue()
|
||||
except Exception as e:
|
||||
logger.exception(e)
|
||||
return src_pdf_bytes
|
||||
finally:
|
||||
if pdf is not None:
|
||||
pdf.close()
|
||||
if sample_docs is not None:
|
||||
sample_docs.close()
|
||||
except Exception as e:
|
||||
logger.exception(e)
|
||||
return src_pdf_bytes
|
||||
finally:
|
||||
close_pdfium_document(pdf)
|
||||
close_pdfium_document(sample_docs)
|
||||
|
||||
|
||||
def detect_invalid_chars(sample_pdf_bytes: bytes) -> bool:
|
||||
|
||||
@@ -17,6 +17,12 @@ from mineru.utils.pdf_reader import image_to_b64str, image_to_bytes, page_to_ima
|
||||
from mineru.utils.enum_class import ImageType
|
||||
from mineru.utils.hash_utils import str_sha256
|
||||
from mineru.utils.pdf_page_id import get_end_page_id
|
||||
from mineru.utils.pdfium_guard import (
|
||||
close_pdfium_document,
|
||||
get_pdfium_document_page_count,
|
||||
open_pdfium_document,
|
||||
pdfium_guard,
|
||||
)
|
||||
|
||||
from concurrent.futures import ProcessPoolExecutor, wait, ALL_COMPLETED
|
||||
|
||||
@@ -130,23 +136,34 @@ def load_images_from_pdf(
|
||||
Raises:
|
||||
TimeoutError: 当转换超时时抛出
|
||||
"""
|
||||
pdf_doc = pdfium.PdfDocument(pdf_bytes)
|
||||
pdf_doc = open_pdfium_document(pdfium.PdfDocument, pdf_bytes)
|
||||
if is_windows_environment():
|
||||
# Windows 环境下不使用多进程
|
||||
return load_images_from_pdf_core(
|
||||
pdf_bytes,
|
||||
dpi,
|
||||
start_page_id,
|
||||
get_end_page_id(end_page_id, len(pdf_doc)),
|
||||
image_type,
|
||||
), pdf_doc
|
||||
try:
|
||||
images_list = load_images_from_pdf_core(
|
||||
pdf_bytes,
|
||||
dpi,
|
||||
start_page_id,
|
||||
get_end_page_id(
|
||||
end_page_id,
|
||||
get_pdfium_document_page_count(pdf_doc),
|
||||
),
|
||||
image_type,
|
||||
)
|
||||
return images_list, pdf_doc
|
||||
except Exception:
|
||||
close_pdfium_document(pdf_doc)
|
||||
raise
|
||||
else:
|
||||
if timeout is None:
|
||||
timeout = get_load_images_timeout()
|
||||
if threads is None:
|
||||
threads = get_load_images_threads()
|
||||
|
||||
end_page_id = get_end_page_id(end_page_id, len(pdf_doc))
|
||||
end_page_id = get_end_page_id(
|
||||
end_page_id,
|
||||
get_pdfium_document_page_count(pdf_doc),
|
||||
)
|
||||
actual_threads, page_ranges = _get_render_process_plan(
|
||||
start_page_id,
|
||||
end_page_id,
|
||||
@@ -179,7 +196,7 @@ def load_images_from_pdf(
|
||||
if not_done:
|
||||
# 超时:强制终止所有子进程
|
||||
_terminate_executor_processes(executor)
|
||||
pdf_doc.close()
|
||||
close_pdfium_document(pdf_doc)
|
||||
raise TimeoutError(f"PDF to images conversion timeout after {timeout}s")
|
||||
|
||||
# 所有任务完成,收集结果
|
||||
@@ -201,7 +218,7 @@ def load_images_from_pdf(
|
||||
except Exception as e:
|
||||
# 发生任何异常时,确保清理子进程
|
||||
_terminate_executor_processes(executor)
|
||||
pdf_doc.close()
|
||||
close_pdfium_document(pdf_doc)
|
||||
if isinstance(e, TimeoutError):
|
||||
raise
|
||||
raise
|
||||
@@ -240,17 +257,20 @@ def load_images_from_pdf_core(
|
||||
image_type=ImageType.PIL, # PIL or BASE64
|
||||
):
|
||||
images_list = []
|
||||
pdf_doc = pdfium.PdfDocument(pdf_bytes)
|
||||
pdf_page_num = len(pdf_doc)
|
||||
end_page_id = get_end_page_id(end_page_id, pdf_page_num)
|
||||
pdf_doc = None
|
||||
try:
|
||||
with pdfium_guard():
|
||||
pdf_doc = open_pdfium_document(pdfium.PdfDocument, pdf_bytes)
|
||||
pdf_page_num = len(pdf_doc)
|
||||
end_page_id = get_end_page_id(end_page_id, pdf_page_num)
|
||||
|
||||
for index in range(start_page_id, end_page_id + 1):
|
||||
# logger.debug(f"Converting page {index}/{pdf_page_num} to image")
|
||||
page = pdf_doc[index]
|
||||
image_dict = pdf_page_to_image(page, dpi=dpi, image_type=image_type)
|
||||
images_list.append(image_dict)
|
||||
|
||||
pdf_doc.close()
|
||||
for index in range(start_page_id, end_page_id + 1):
|
||||
# logger.debug(f"Converting page {index}/{pdf_page_num} to image")
|
||||
page = pdf_doc[index]
|
||||
image_dict = pdf_page_to_image(page, dpi=dpi, image_type=image_type)
|
||||
images_list.append(image_dict)
|
||||
finally:
|
||||
close_pdfium_document(pdf_doc)
|
||||
|
||||
return images_list
|
||||
|
||||
@@ -263,13 +283,14 @@ def load_images_from_pdf_doc(
|
||||
image_type=ImageType.PIL,
|
||||
):
|
||||
images_list = []
|
||||
pdf_page_num = len(pdf_doc)
|
||||
end_page_id = get_end_page_id(end_page_id, pdf_page_num)
|
||||
with pdfium_guard():
|
||||
pdf_page_num = len(pdf_doc)
|
||||
end_page_id = get_end_page_id(end_page_id, pdf_page_num)
|
||||
|
||||
for index in range(start_page_id, end_page_id + 1):
|
||||
page = pdf_doc[index]
|
||||
image_dict = pdf_page_to_image(page, dpi=dpi, image_type=image_type)
|
||||
images_list.append(image_dict)
|
||||
for index in range(start_page_id, end_page_id + 1):
|
||||
page = pdf_doc[index]
|
||||
image_dict = pdf_page_to_image(page, dpi=dpi, image_type=image_type)
|
||||
images_list.append(image_dict)
|
||||
|
||||
return images_list
|
||||
|
||||
@@ -354,4 +375,4 @@ def images_bytes_to_pdf_bytes(image_bytes):
|
||||
# 获取 PDF bytes 并重置指针(可选)
|
||||
pdf_bytes = pdf_buffer.getvalue()
|
||||
pdf_buffer.close()
|
||||
return pdf_bytes
|
||||
return pdf_bytes
|
||||
|
||||
@@ -5,6 +5,11 @@ from io import BytesIO
|
||||
from loguru import logger
|
||||
from PIL import Image
|
||||
from pypdfium2 import PdfBitmap, PdfDocument, PdfPage
|
||||
from mineru.utils.pdfium_guard import (
|
||||
close_pdfium_document,
|
||||
open_pdfium_document,
|
||||
pdfium_guard,
|
||||
)
|
||||
|
||||
|
||||
def page_to_image(
|
||||
@@ -12,19 +17,20 @@ def page_to_image(
|
||||
dpi: int = 200,
|
||||
max_width_or_height: int = 3500, # changed from 4500 to 3500
|
||||
) -> (Image.Image, float):
|
||||
scale = dpi / 72
|
||||
with pdfium_guard():
|
||||
scale = dpi / 72
|
||||
|
||||
long_side_length = max(*page.get_size())
|
||||
if (long_side_length*scale) > max_width_or_height:
|
||||
scale = max_width_or_height / long_side_length
|
||||
long_side_length = max(*page.get_size())
|
||||
if (long_side_length*scale) > max_width_or_height:
|
||||
scale = max_width_or_height / long_side_length
|
||||
|
||||
bitmap: PdfBitmap = page.render(scale=scale) # type: ignore
|
||||
bitmap: PdfBitmap = page.render(scale=scale) # type: ignore
|
||||
|
||||
image = bitmap.to_pil()
|
||||
try:
|
||||
bitmap.close()
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to close bitmap: {e}")
|
||||
image = bitmap.to_pil()
|
||||
try:
|
||||
bitmap.close()
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to close bitmap: {e}")
|
||||
return image, scale
|
||||
|
||||
|
||||
@@ -65,25 +71,23 @@ def pdf_to_images(
|
||||
start_page_id: int = 0,
|
||||
end_page_id: int | None = None,
|
||||
) -> list[Image.Image]:
|
||||
doc = pdf if isinstance(pdf, PdfDocument) else PdfDocument(pdf)
|
||||
page_num = len(doc)
|
||||
|
||||
end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else page_num - 1
|
||||
if end_page_id > page_num - 1:
|
||||
logger.warning("end_page_id is out of range, use images length")
|
||||
end_page_id = page_num - 1
|
||||
|
||||
images = []
|
||||
doc = pdf if isinstance(pdf, PdfDocument) else open_pdfium_document(PdfDocument, pdf)
|
||||
try:
|
||||
for i in range(start_page_id, end_page_id + 1):
|
||||
image, _ = page_to_image(doc[i], dpi, max_width_or_height)
|
||||
images.append(image)
|
||||
with pdfium_guard():
|
||||
page_num = len(doc)
|
||||
|
||||
end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else page_num - 1
|
||||
if end_page_id > page_num - 1:
|
||||
logger.warning("end_page_id is out of range, use images length")
|
||||
end_page_id = page_num - 1
|
||||
|
||||
images = []
|
||||
for i in range(start_page_id, end_page_id + 1):
|
||||
image, _ = page_to_image(doc[i], dpi, max_width_or_height)
|
||||
images.append(image)
|
||||
return images
|
||||
finally:
|
||||
try:
|
||||
doc.close()
|
||||
except Exception:
|
||||
pass
|
||||
return images
|
||||
close_pdfium_document(doc)
|
||||
|
||||
|
||||
def pdf_to_images_bytes(
|
||||
|
||||
@@ -1,18 +1,20 @@
|
||||
from typing import List
|
||||
import math
|
||||
from typing import List
|
||||
|
||||
import pypdfium2 as pdfium
|
||||
from pdftext.pdf.chars import get_chars, deduplicate_chars
|
||||
from pdftext.pdf.pages import get_spans, get_lines, assign_scripts, get_blocks
|
||||
from pdftext.pdf.chars import deduplicate_chars, get_chars
|
||||
from pdftext.pdf.pages import assign_scripts, get_blocks, get_lines, get_spans
|
||||
|
||||
from mineru.utils.pdfium_guard import pdfium_guard
|
||||
|
||||
|
||||
def get_page(
|
||||
page: pdfium.PdfPage,
|
||||
quote_loosebox: bool =True,
|
||||
quote_loosebox: bool = True,
|
||||
superscript_height_threshold: float = 0.7,
|
||||
line_distance_threshold: float = 0.1,
|
||||
) -> dict:
|
||||
|
||||
with pdfium_guard():
|
||||
textpage = page.get_textpage()
|
||||
page_bbox: List[float] = page.get_bbox()
|
||||
page_width = math.ceil(abs(page_bbox[2] - page_bbox[0]))
|
||||
@@ -21,20 +23,29 @@ def get_page(
|
||||
page_rotation = 0
|
||||
try:
|
||||
page_rotation = page.get_rotation()
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
chars = deduplicate_chars(get_chars(textpage, page_bbox, page_rotation, quote_loosebox))
|
||||
spans = get_spans(chars, superscript_height_threshold=superscript_height_threshold, line_distance_threshold=line_distance_threshold)
|
||||
chars = deduplicate_chars(
|
||||
get_chars(textpage, page_bbox, page_rotation, quote_loosebox)
|
||||
)
|
||||
spans = get_spans(
|
||||
chars,
|
||||
superscript_height_threshold=superscript_height_threshold,
|
||||
line_distance_threshold=line_distance_threshold,
|
||||
)
|
||||
lines = get_lines(spans)
|
||||
assign_scripts(lines, height_threshold=superscript_height_threshold, line_distance_threshold=line_distance_threshold)
|
||||
assign_scripts(
|
||||
lines,
|
||||
height_threshold=superscript_height_threshold,
|
||||
line_distance_threshold=line_distance_threshold,
|
||||
)
|
||||
blocks = get_blocks(lines)
|
||||
|
||||
page = {
|
||||
return {
|
||||
"bbox": page_bbox,
|
||||
"width": page_width,
|
||||
"height": page_height,
|
||||
"rotation": page_rotation,
|
||||
"blocks": blocks
|
||||
"blocks": blocks,
|
||||
}
|
||||
return page
|
||||
Reference in New Issue
Block a user