From 5de8f1a19f06551a4549806a5621ebd5dddaff1f Mon Sep 17 00:00:00 2001 From: myhloli Date: Tue, 4 Nov 2025 19:47:59 +0800 Subject: [PATCH] feat: add environment variables for PDF rendering timeout and ONNX thread management --- docs/en/usage/cli_tools.md | 11 +++++++ docs/zh/usage/cli_tools.md | 12 ++++++++ .../table/rec/slanet_plus/table_structure.py | 2 +- .../rec/unet_table/table_structure_unet.py | 2 +- mineru/model/utils/onnx_config.py | 24 --------------- mineru/utils/os_env_config.py | 30 +++++++++++++++++++ mineru/utils/pdf_image_tools.py | 5 +++- 7 files changed, 59 insertions(+), 27 deletions(-) delete mode 100644 mineru/model/utils/onnx_config.py create mode 100644 mineru/utils/os_env_config.py diff --git a/docs/en/usage/cli_tools.md b/docs/en/usage/cli_tools.md index 14d81408..5b81027e 100644 --- a/docs/en/usage/cli_tools.md +++ b/docs/en/usage/cli_tools.md @@ -100,3 +100,14 @@ Here are the environment variables and their descriptions: * Used to enable table merging functionality * Default is `true`, can be set to `false` via environment variable to disable table merging functionality. +- `MINERU_PDF_LOAD_IMAGES_TIMEOUT`: + * Used to set the timeout period (in seconds) for rendering PDF to images + * Default is `300` seconds, can be set to other values via environment variable to adjust the image rendering timeout. + +- `MINERU_INTRA_OP_NUM_THREADS`: + * Used to set the intra_op thread count for ONNX models, affects the computation speed of individual operators + * Default is `-1` (auto-select), can be set to other values via environment variable to adjust the thread count. + +- `MINERU_INTER_OP_NUM_THREADS`: + * Used to set the inter_op thread count for ONNX models, affects the parallel execution of multiple operators + * Default is `-1` (auto-select), can be set to other values via environment variable to adjust the thread count. 
diff --git a/docs/zh/usage/cli_tools.md b/docs/zh/usage/cli_tools.md index 99f0c370..15d4d2eb 100644 --- a/docs/zh/usage/cli_tools.md +++ b/docs/zh/usage/cli_tools.md @@ -94,3 +94,15 @@ MinerU命令行工具的某些参数存在相同功能的环境变量配置, - `MINERU_TABLE_MERGE_ENABLE`: * 用于启用表格合并功能 * 默认为`true`,可通过环境变量设置为`false`来禁用表格合并功能。 + +- `MINERU_PDF_LOAD_IMAGES_TIMEOUT`: + * 用于设置将PDF渲染为图片的超时时间(秒) + * 默认为`300`秒,可通过环境变量设置为其他值以调整渲染图片的超时时间。 + +- `MINERU_INTRA_OP_NUM_THREADS`: + * 用于设置onnx模型的intra_op线程数,影响单个算子的计算速度 + * 默认为`-1`(自动选择),可通过环境变量设置为其他值以调整线程数。 + +- `MINERU_INTER_OP_NUM_THREADS`: + * 用于设置onnx模型的inter_op线程数,影响多个算子的并行执行 + * 默认为`-1`(自动选择),可通过环境变量设置为其他值以调整线程数。 diff --git a/mineru/model/table/rec/slanet_plus/table_structure.py b/mineru/model/table/rec/slanet_plus/table_structure.py index 99c704bc..d9b2f0ce 100644 --- a/mineru/model/table/rec/slanet_plus/table_structure.py +++ b/mineru/model/table/rec/slanet_plus/table_structure.py @@ -16,7 +16,7 @@ from typing import Any, Dict, List, Tuple import numpy as np -from mineru.model.utils.onnx_config import get_op_num_threads +from mineru.utils.os_env_config import get_op_num_threads from .table_structure_utils import ( OrtInferSession, TableLabelDecode, diff --git a/mineru/model/table/rec/unet_table/table_structure_unet.py b/mineru/model/table/rec/unet_table/table_structure_unet.py index 642dee2d..d20ced76 100644 --- a/mineru/model/table/rec/unet_table/table_structure_unet.py +++ b/mineru/model/table/rec/unet_table/table_structure_unet.py @@ -6,7 +6,7 @@ import cv2 import numpy as np from skimage import measure -from mineru.model.utils.onnx_config import get_op_num_threads +from mineru.utils.os_env_config import get_op_num_threads from .utils import OrtInferSession, resize_img from .utils_table_line_rec import ( get_table_line, diff --git a/mineru/model/utils/onnx_config.py b/mineru/model/utils/onnx_config.py deleted file mode 100644 index 492ea743..00000000 --- a/mineru/model/utils/onnx_config.py +++ /dev/null @@ -1,24 +0,0 @@ -import os - - -def 
import os
from typing import Optional


def get_op_num_threads(env_name: str) -> int:
    """Return the ONNX thread count configured in the environment.

    Args:
        env_name: Name of the environment variable to read, for example
            ``MINERU_INTRA_OP_NUM_THREADS`` or ``MINERU_INTER_OP_NUM_THREADS``.

    Returns:
        The positive integer stored in the variable, or ``-1`` when the
        variable is unset, is not an integer, or is not positive.
    """
    return get_value_from_string(os.getenv(env_name), -1)


def get_load_images_timeout() -> int:
    """Return the PDF-to-image rendering timeout in seconds.

    The value is read from ``MINERU_PDF_LOAD_IMAGES_TIMEOUT``.

    Returns:
        The positive integer stored in the variable, or ``300`` when the
        variable is unset, is not an integer, or is not positive.
    """
    return get_value_from_string(
        os.getenv('MINERU_PDF_LOAD_IMAGES_TIMEOUT'), 300
    )


def get_value_from_string(env_value: Optional[str], default_value: int) -> int:
    """Parse a positive integer from a raw environment-variable string.

    Args:
        env_value: Raw string as returned by ``os.getenv``; ``None`` when the
            variable is not set.
        default_value: Value returned whenever ``env_value`` is unusable.

    Returns:
        ``int(env_value)`` when it parses and is strictly greater than zero;
        otherwise ``default_value``.
    """
    if env_value is None:
        return default_value
    # Keep the try body to the single statement that can raise.
    try:
        parsed = int(env_value)
    except ValueError:
        return default_value
    # Zero and negative numbers are treated as "not configured".
    return parsed if parsed > 0 else default_value


if __name__ == '__main__':
    print(get_value_from_string('1', -1))
    print(get_value_from_string('0', -1))
    print(get_value_from_string('-1', -1))
    print(get_value_from_string('abc', -1))
    print(get_load_images_timeout())
get_load_images_timeout from mineru.utils.pdf_reader import image_to_b64str, image_to_bytes, page_to_image from mineru.utils.enum_class import ImageType from mineru.utils.hash_utils import str_sha256 @@ -51,7 +52,7 @@ def load_images_from_pdf( start_page_id=0, end_page_id=None, image_type=ImageType.PIL, - timeout=300, + timeout=None, threads=4, ): """带超时控制的 PDF 转图片函数,支持多进程加速 @@ -79,6 +80,8 @@ def load_images_from_pdf( image_type ), pdf_doc else: + if timeout is None: + timeout = get_load_images_timeout() end_page_id = get_end_page_id(end_page_id, len(pdf_doc)) # 计算总页数