mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
feat: add environment variables for PDF rendering timeout and ONNX thread management
This commit is contained in:
@@ -100,3 +100,14 @@ Here are the environment variables and their descriptions:
|
||||
* Used to enable table merging functionality
|
||||
* Defaults to `true`; set this environment variable to `false` to disable table merging.
|
||||
|
||||
- `MINERU_PDF_LOAD_IMAGES_TIMEOUT`:
|
||||
* Used to set the timeout period (in seconds) for rendering PDF to images
|
||||
* Defaults to `300` seconds; set this environment variable to another value to adjust the image rendering timeout.
|
||||
|
||||
- `MINERU_INTRA_OP_NUM_THREADS`:
|
||||
* Used to set the intra_op thread count for ONNX models, which affects the computation speed of individual operators
|
||||
* Defaults to `-1` (auto-select); set this environment variable to another value to adjust the thread count.
|
||||
|
||||
- `MINERU_INTER_OP_NUM_THREADS`:
|
||||
* Used to set the inter_op thread count for ONNX models, which affects the parallel execution of multiple operators
|
||||
* Defaults to `-1` (auto-select); set this environment variable to another value to adjust the thread count.
|
||||
|
||||
@@ -94,3 +94,15 @@ MinerU命令行工具的某些参数存在相同功能的环境变量配置,
|
||||
- `MINERU_TABLE_MERGE_ENABLE`:
|
||||
* 用于启用表格合并功能
|
||||
* 默认为`true`,可通过环境变量设置为`false`来禁用表格合并功能。
|
||||
|
||||
- `MINERU_PDF_LOAD_IMAGES_TIMEOUT`:
|
||||
* 用于设置将PDF渲染为图片的超时时间(秒)
|
||||
* 默认为`300`秒,可通过环境变量设置为其他值以调整渲染图片的超时时间。
|
||||
|
||||
- `MINERU_INTRA_OP_NUM_THREADS`:
|
||||
* 用于设置onnx模型的intra_op线程数,影响单个算子的计算速度
|
||||
* 默认为`-1`(自动选择),可通过环境变量设置为其他值以调整线程数。
|
||||
|
||||
- `MINERU_INTER_OP_NUM_THREADS`:
|
||||
* 用于设置onnx模型的inter_op线程数,影响多个算子的并行执行
|
||||
* 默认为`-1`(自动选择),可通过环境变量设置为其他值以调整线程数。
|
||||
|
||||
@@ -16,7 +16,7 @@ from typing import Any, Dict, List, Tuple
|
||||
|
||||
import numpy as np
|
||||
|
||||
from mineru.model.utils.onnx_config import get_op_num_threads
|
||||
from mineru.utils.os_env_config import get_op_num_threads
|
||||
from .table_structure_utils import (
|
||||
OrtInferSession,
|
||||
TableLabelDecode,
|
||||
|
||||
@@ -6,7 +6,7 @@ import cv2
|
||||
import numpy as np
|
||||
from skimage import measure
|
||||
|
||||
from mineru.model.utils.onnx_config import get_op_num_threads
|
||||
from mineru.utils.os_env_config import get_op_num_threads
|
||||
from .utils import OrtInferSession, resize_img
|
||||
from .utils_table_line_rec import (
|
||||
get_table_line,
|
||||
|
||||
@@ -1,24 +0,0 @@
|
||||
import os
|
||||
|
||||
|
||||
def get_op_num_threads(env_name: str) -> int:
    """Read a thread-count setting from the environment.

    Args:
        env_name: Name of the environment variable to look up.

    Returns:
        Whatever get_op_num_threads_from_value yields for the variable's raw
        text (None is passed through when the variable is unset).
    """
    return get_op_num_threads_from_value(os.getenv(env_name))
|
||||
|
||||
|
||||
def get_op_num_threads_from_value(env_value: str) -> int:
    """Turn a raw setting into a thread count, using -1 as the fallback.

    Args:
        env_value: Raw text of the setting, or None when it is unset.

    Returns:
        The parsed integer when it is strictly positive; otherwise -1
        (missing value, non-integer text, zero, or a negative number).
    """
    fallback = -1
    if env_value is None:
        return fallback
    try:
        requested = int(env_value)
    except ValueError:
        # Text that is not an integer literal is treated like an unset value.
        return fallback
    return requested if requested > 0 else fallback
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke check: print the parsed result for a positive, a zero,
    # a negative and a non-numeric sample, in that order.
    for sample in ('1', '0', '-1', 'abc'):
        print(get_op_num_threads_from_value(sample))
|
||||
30
mineru/utils/os_env_config.py
Normal file
30
mineru/utils/os_env_config.py
Normal file
@@ -0,0 +1,30 @@
|
||||
import os
|
||||
|
||||
|
||||
def get_op_num_threads(env_name: str) -> int:
    """Read a thread-count setting from the environment.

    Args:
        env_name: Name of the environment variable to look up.

    Returns:
        The result of get_value_from_string for the variable's raw text,
        with -1 supplied as the default.
    """
    raw_setting = os.getenv(env_name)
    return get_value_from_string(raw_setting, -1)
|
||||
|
||||
|
||||
def get_load_images_timeout() -> int:
    """Read the MINERU_PDF_LOAD_IMAGES_TIMEOUT setting from the environment.

    Returns:
        The result of get_value_from_string for the variable's raw text,
        with 300 supplied as the default.
    """
    raw_setting = os.getenv('MINERU_PDF_LOAD_IMAGES_TIMEOUT')
    return get_value_from_string(raw_setting, 300)
|
||||
|
||||
|
||||
def get_value_from_string(env_value: str, default_value: int) -> int:
    """Parse a strictly positive integer from raw text, else use a default.

    Args:
        env_value: Raw text of the setting, or None when it is unset.
        default_value: Value returned when the text is missing, is not an
            integer literal, or parses to zero or a negative number.

    Returns:
        The parsed integer when it is greater than zero; otherwise
        default_value.
    """
    if env_value is None:
        return default_value
    try:
        parsed = int(env_value)
    except ValueError:
        # Text that is not an integer literal is treated like an unset value.
        return default_value
    return parsed if parsed > 0 else default_value
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke check: print the parsed result for a positive, a zero,
    # a negative and a non-numeric sample, then the resolved timeout.
    for sample in ('1', '0', '-1', 'abc'):
        print(get_value_from_string(sample, -1))
    print(get_load_images_timeout())
|
||||
@@ -9,6 +9,7 @@ from PIL import Image
|
||||
|
||||
from mineru.data.data_reader_writer import FileBasedDataWriter
|
||||
from mineru.utils.check_sys_env import is_windows_environment
|
||||
from mineru.utils.os_env_config import get_load_images_timeout
|
||||
from mineru.utils.pdf_reader import image_to_b64str, image_to_bytes, page_to_image
|
||||
from mineru.utils.enum_class import ImageType
|
||||
from mineru.utils.hash_utils import str_sha256
|
||||
@@ -51,7 +52,7 @@ def load_images_from_pdf(
|
||||
start_page_id=0,
|
||||
end_page_id=None,
|
||||
image_type=ImageType.PIL,
|
||||
timeout=300,
|
||||
timeout=None,
|
||||
threads=4,
|
||||
):
|
||||
"""带超时控制的 PDF 转图片函数,支持多进程加速
|
||||
@@ -79,6 +80,8 @@ def load_images_from_pdf(
|
||||
image_type
|
||||
), pdf_doc
|
||||
else:
|
||||
if timeout is None:
|
||||
timeout = get_load_images_timeout()
|
||||
end_page_id = get_end_page_id(end_page_id, len(pdf_doc))
|
||||
|
||||
# 计算总页数
|
||||
|
||||
Reference in New Issue
Block a user