feat: add environment variables for PDF rendering timeout and ONNX thread management

This commit is contained in:
myhloli
2025-11-04 19:47:59 +08:00
parent be2369bdd4
commit 5de8f1a19f
7 changed files with 59 additions and 27 deletions

View File

@@ -100,3 +100,14 @@ Here are the environment variables and their descriptions:
* Used to enable table merging functionality
* Default is `true`, can be set to `false` via environment variable to disable table merging functionality.
- `MINERU_PDF_LOAD_IMAGES_TIMEOUT`:
* Used to set the timeout period (in seconds) for rendering PDF to images
* Default is `300` seconds, can be set to other values via environment variable to adjust the image rendering timeout. Values that are not positive integers fall back to the default.
- `MINERU_INTRA_OP_NUM_THREADS`:
* Used to set the intra_op thread count for ONNX models, affects the computation speed of individual operators
* Default is `-1` (auto-select), can be set to a positive integer via environment variable to adjust the thread count. Any other value is treated as `-1`.
- `MINERU_INTER_OP_NUM_THREADS`:
* Used to set the inter_op thread count for ONNX models, affects the parallel execution of multiple operators
* Default is `-1` (auto-select), can be set to a positive integer via environment variable to adjust the thread count. Any other value is treated as `-1`.

View File

@@ -94,3 +94,15 @@ MinerU命令行工具的某些参数存在相同功能的环境变量配置
- `MINERU_TABLE_MERGE_ENABLE`
* 用于启用表格合并功能
* 默认为`true`,可通过环境变量设置为`false`来禁用表格合并功能。
- `MINERU_PDF_LOAD_IMAGES_TIMEOUT`
* 用于设置将PDF渲染为图片的超时时间
* 默认为`300`秒,可通过环境变量设置为其他值以调整渲染图片的超时时间。
- `MINERU_INTRA_OP_NUM_THREADS`
* 用于设置onnx模型的intra_op线程数,影响单个算子的计算速度
* 默认为`-1`(自动选择),可通过环境变量设置为其他值以调整线程数。
- `MINERU_INTER_OP_NUM_THREADS`
* 用于设置onnx模型的inter_op线程数,影响多个算子的并行执行
* 默认为`-1`(自动选择),可通过环境变量设置为其他值以调整线程数。

View File

@@ -16,7 +16,7 @@ from typing import Any, Dict, List, Tuple
import numpy as np
from mineru.model.utils.onnx_config import get_op_num_threads
from mineru.utils.os_env_config import get_op_num_threads
from .table_structure_utils import (
OrtInferSession,
TableLabelDecode,

View File

@@ -6,7 +6,7 @@ import cv2
import numpy as np
from skimage import measure
from mineru.model.utils.onnx_config import get_op_num_threads
from mineru.utils.os_env_config import get_op_num_threads
from .utils import OrtInferSession, resize_img
from .utils_table_line_rec import (
get_table_line,

View File

@@ -1,24 +0,0 @@
import os
def get_op_num_threads(env_name: str) -> int:
    """Return the thread count configured through the environment variable *env_name*.

    The raw string (or ``None`` when the variable is unset) is passed to
    ``get_op_num_threads_from_value``, which does the actual parsing.
    """
    raw_setting = os.environ.get(env_name)
    return get_op_num_threads_from_value(raw_setting)
def get_op_num_threads_from_value(env_value: str) -> int:
    """Parse a thread-count setting, returning ``-1`` when it is unusable.

    ``-1`` is returned when *env_value* is ``None``, is not an integer
    literal, or parses to zero or a negative number. Otherwise the parsed
    positive integer is returned.
    """
    if env_value is None:
        return -1
    try:
        parsed = int(env_value)
    except ValueError:
        return -1
    # Only strictly positive counts are accepted.
    return parsed if parsed > 0 else -1
if __name__ == '__main__':
    # Manual smoke check: a positive value, zero, a negative value and a
    # non-numeric string, printed in that order.
    for sample in ('1', '0', '-1', 'abc'):
        print(get_op_num_threads_from_value(sample))

View File

@@ -0,0 +1,30 @@
import os
def get_op_num_threads(env_name: str) -> int:
    """Return the thread count configured through the environment variable *env_name*.

    The raw string (or ``None`` when the variable is unset) is parsed by
    ``get_value_from_string`` with ``-1`` as the fallback value.
    """
    raw_setting = os.environ.get(env_name)
    return get_value_from_string(raw_setting, -1)
def get_load_images_timeout() -> int:
    """Return the PDF image-loading timeout read from ``MINERU_PDF_LOAD_IMAGES_TIMEOUT``.

    The raw string (or ``None`` when the variable is unset) is parsed by
    ``get_value_from_string`` with ``300`` as the fallback value.
    """
    raw_setting = os.environ.get('MINERU_PDF_LOAD_IMAGES_TIMEOUT')
    return get_value_from_string(raw_setting, 300)
def get_value_from_string(env_value: str, default_value: int) -> int:
    """Parse *env_value* as a positive integer, falling back to *default_value*.

    *default_value* is returned when *env_value* is ``None``, is not an
    integer literal, or parses to zero or a negative number. Otherwise the
    parsed positive integer is returned.
    """
    if env_value is None:
        return default_value
    try:
        parsed = int(env_value)
    except ValueError:
        return default_value
    # Only strictly positive values override the default.
    return parsed if parsed > 0 else default_value
if __name__ == '__main__':
    # Manual smoke check: a positive value, zero, a negative value and a
    # non-numeric string, each parsed with -1 as the fallback.
    for sample in ('1', '0', '-1', 'abc'):
        print(get_value_from_string(sample, -1))
    # Then the timeout as resolved from the current environment.
    print(get_load_images_timeout())

View File

@@ -9,6 +9,7 @@ from PIL import Image
from mineru.data.data_reader_writer import FileBasedDataWriter
from mineru.utils.check_sys_env import is_windows_environment
from mineru.utils.os_env_config import get_load_images_timeout
from mineru.utils.pdf_reader import image_to_b64str, image_to_bytes, page_to_image
from mineru.utils.enum_class import ImageType
from mineru.utils.hash_utils import str_sha256
@@ -51,7 +52,7 @@ def load_images_from_pdf(
start_page_id=0,
end_page_id=None,
image_type=ImageType.PIL,
timeout=300,
timeout=None,
threads=4,
):
"""带超时控制的 PDF 转图片函数,支持多进程加速
@@ -79,6 +80,8 @@ def load_images_from_pdf(
image_type
), pdf_doc
else:
if timeout is None:
timeout = get_load_images_timeout()
end_page_id = get_end_page_id(end_page_id, len(pdf_doc))
# 计算总页数