diff --git a/docker/china/mlu.Dockerfile b/docker/china/mlu.Dockerfile index cf679f1c..f9dd9382 100644 --- a/docker/china/mlu.Dockerfile +++ b/docker/china/mlu.Dockerfile @@ -1,6 +1,6 @@ # 基础镜像配置 vLLM 或 LMDeploy ,请根据实际需要选择其中一个,要求 amd64(x86-64) CPU + Cambricon MLU. # Base image containing the LMDEPLOY inference environment, requiring amd64(x86-64) CPU + Cambricon MLU. -FROM crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/camb:qwen_vl2.5 +FROM crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/camb:qwen2.5_vl ARG BACKEND=lmdeploy # Base image containing the vLLM inference environment, requiring amd64(x86-64) CPU + Cambricon MLU. # FROM crpi-vofi3w62lkohhxsp.cn-shanghai.personal.cr.aliyuncs.com/opendatalab-mineru/mlu:vllm0.8.3-torch2.6.0-torchmlu1.26.1-ubuntu22.04-py310 diff --git a/docs/en/usage/cli_tools.md b/docs/en/usage/cli_tools.md index 00e65cb8..0b9ef302 100644 --- a/docs/en/usage/cli_tools.md +++ b/docs/en/usage/cli_tools.md @@ -77,7 +77,8 @@ Here are the environment variables and their descriptions: - `MINERU_MODEL_SOURCE`: * Used to specify model source * supports `huggingface/modelscope/local` - * defaults to `huggingface`, can be switched to `modelscope` or local models through environment variables. + * Default is `huggingface`; you can switch via an environment variable to `modelscope` to use a domestic acceleration mirror, or switch to `local` to use a local model. + - `MINERU_TOOLS_CONFIG_JSON`: * Used to specify configuration file path @@ -101,8 +102,14 @@ Here are the environment variables and their descriptions: * Default is `true`, can be set to `false` via environment variable to disable table merging functionality. - `MINERU_PDF_RENDER_TIMEOUT`: - * Used to set the timeout period (in seconds) for rendering PDF to images - * Default is `300` seconds, can be set to other values via environment variable to adjust the image rendering timeout. 
+ * Used to set the timeout (in seconds) for rendering PDFs to images. + * Default is `300` seconds; you can set a different value via an environment variable to adjust the rendering timeout. + * Only effective on Linux and macOS systems. + +- `MINERU_PDF_RENDER_THREADS`: + * Used to set the number of threads used when rendering PDFs to images. + * Default is `4`; you can set a different value via an environment variable to adjust the number of threads for image rendering. + * Only effective on Linux and macOS systems. - `MINERU_INTRA_OP_NUM_THREADS`: * Used to set the intra_op thread count for ONNX models, affects the computation speed of individual operators diff --git a/docs/zh/usage/acceleration_cards/Ascend.md b/docs/zh/usage/acceleration_cards/Ascend.md index 739cb319..0ae863ff 100644 --- a/docs/zh/usage/acceleration_cards/Ascend.md +++ b/docs/zh/usage/acceleration_cards/Ascend.md @@ -175,4 +175,5 @@ docker run -u root --name mineru_docker --privileged=true \ 🔴: 不支持,无法运行,或精度存在较大差异 >[!TIP] ->NPU加速卡指定可用加速卡的方式与NVIDIA GPU类似,请参考[ASCEND_RT_VISIBLE_DEVICES](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/850alpha001/maintenref/envvar/envref_07_0028.html) \ No newline at end of file +> - NPU加速卡指定可用加速卡的方式与NVIDIA GPU类似,请参考[ASCEND_RT_VISIBLE_DEVICES](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/850alpha001/maintenref/envvar/envref_07_0028.html) +> - 在Ascend平台可以通过`npu-smi info`命令查看加速卡的使用情况,并根据需要指定空闲的加速卡ID以避免资源冲突。 \ No newline at end of file diff --git a/docs/zh/usage/acceleration_cards/Cambricon.md b/docs/zh/usage/acceleration_cards/Cambricon.md index e13af1d4..254c5291 100644 --- a/docs/zh/usage/acceleration_cards/Cambricon.md +++ b/docs/zh/usage/acceleration_cards/Cambricon.md @@ -36,24 +36,11 @@ docker run --name mineru_docker \ --privileged \ --ipc=host \ --network=host \ - --cap-add SYS_PTRACE \ - --device=/dev/mem \ - --device=/dev/dri \ - --device=/dev/infiniband \ - --device=/dev/cambricon_ctl \ - --device=/dev/cambricon_dev0 
\ - --device=/dev/cambricon_dev1 \ - --device=/dev/cambricon_dev2 \ - --device=/dev/cambricon_dev3 \ - --device=/dev/cambricon_dev4 \ - --device=/dev/cambricon_dev5 \ - --device=/dev/cambricon_dev6 \ - --device=/dev/cambricon_dev7 \ - --group-add video \ --shm-size=400g \ --ulimit memlock=-1 \ - --security-opt seccomp=unconfined \ - --security-opt apparmor=unconfined \ + -v /dev:/dev \ + -v /lib/modules:/lib/modules:ro \ + -v /usr/bin/cnmon:/usr/bin/cnmon \ -e MINERU_MODEL_SOURCE=local \ -e MINERU_LMDEPLOY_DEVICE=camb \ -it mineru:mlu-lmdeploy-latest \ @@ -86,7 +73,7 @@ docker run --name mineru_docker \ 不同环境下,MinerU对Cambricon加速卡的支持情况如下表所示: >[!TIP] -> - `lmdeploy`黄灯问题为不能批量输出文件夹,单文件输入正常 +> - `lmdeploy`黄灯问题为不能输入文件夹使用批量解析功能,输入单个文件时表现正常。 > - `vllm`黄灯问题为在精度未对齐,在部分场景下可能出现预期外结果。 @@ -168,5 +155,6 @@ docker run --name mineru_docker \ 🔴: 不支持,无法运行,或精度存在较大差异 >[!TIP] ->Cambricon加速卡指定可用加速卡的方式与NVIDIA GPU类似,请参考[使用指定GPU设备](https://opendatalab.github.io/MinerU/zh/usage/advanced_cli_parameters/#cuda_visible_devices)章节说明, +> - Cambricon加速卡指定可用加速卡的方式与NVIDIA GPU类似,请参考[使用指定GPU设备](https://opendatalab.github.io/MinerU/zh/usage/advanced_cli_parameters/#cuda_visible_devices)章节说明, >将环境变量`CUDA_VISIBLE_DEVICES`替换为`MLU_VISIBLE_DEVICES`即可。 +> - 在Cambricon平台可以通过`cnmon`命令查看加速卡的使用情况,并根据需要指定空闲的加速卡ID以避免资源冲突。 \ No newline at end of file diff --git a/docs/zh/usage/acceleration_cards/Enflame.md b/docs/zh/usage/acceleration_cards/Enflame.md index c92e1e16..fdd86c06 100644 --- a/docs/zh/usage/acceleration_cards/Enflame.md +++ b/docs/zh/usage/acceleration_cards/Enflame.md @@ -105,5 +105,6 @@ docker run -u root --name mineru_docker \ 🔴: 不支持,无法运行,或精度存在较大差异 >[!TIP] ->GCU加速卡指定可用加速卡的方式与NVIDIA GPU类似,请参考[使用指定GPU设备](https://opendatalab.github.io/MinerU/zh/usage/advanced_cli_parameters/#cuda_visible_devices)章节说明, ->将环境变量`CUDA_VISIBLE_DEVICES`替换为`TOPS_VISIBLE_DEVICES`即可。 \ No newline at end of file +> - GCU加速卡指定可用加速卡的方式与NVIDIA 
GPU类似,请参考[使用指定GPU设备](https://opendatalab.github.io/MinerU/zh/usage/advanced_cli_parameters/#cuda_visible_devices)章节说明, +>将环境变量`CUDA_VISIBLE_DEVICES`替换为`TOPS_VISIBLE_DEVICES`即可。 +> - 在Enflame平台可以通过`efsmi`命令查看加速卡的使用情况,并根据需要指定空闲的加速卡ID以避免资源冲突。 \ No newline at end of file diff --git a/docs/zh/usage/acceleration_cards/Hygon.md b/docs/zh/usage/acceleration_cards/Hygon.md index 9b3e2da3..286bedcf 100644 --- a/docs/zh/usage/acceleration_cards/Hygon.md +++ b/docs/zh/usage/acceleration_cards/Hygon.md @@ -2,7 +2,7 @@ 以下为本指南测试使用的平台信息,供参考: ``` os: Ubuntu 22.04.3 LTS -cpu: Hygon Hygon C86-4G(x86-64) +cpu: Hygon C86-4G(x86-64) dcu: BW200 driver: 6.3.13-V1.12.0a docker: 20.10.24 @@ -112,4 +112,5 @@ docker run -u root --name mineru_docker \ 🔴: 不支持,无法运行,或精度存在较大差异 >[!TIP] ->DCU加速卡指定可用加速卡的方式与AMD GPU类似,请参考[GPU isolation techniques](https://rocm.docs.amd.com/en/docs-6.2.4/conceptual/gpu-isolation.html) \ No newline at end of file +> - DCU加速卡指定可用加速卡的方式与AMD GPU类似,请参考[GPU isolation techniques](https://rocm.docs.amd.com/en/docs-6.2.4/conceptual/gpu-isolation.html) +> - 在Hygon平台可以通过`hy-smi`命令查看加速卡的使用情况,并根据需要指定空闲的加速卡ID以避免资源冲突。 \ No newline at end of file diff --git a/docs/zh/usage/acceleration_cards/IluvatarCorex.md b/docs/zh/usage/acceleration_cards/IluvatarCorex.md index 58566bfd..9c550a87 100644 --- a/docs/zh/usage/acceleration_cards/IluvatarCorex.md +++ b/docs/zh/usage/acceleration_cards/IluvatarCorex.md @@ -119,4 +119,5 @@ docker run --name mineru_docker \ 🔴: 不支持,无法运行,或精度存在较大差异 >[!TIP] ->Iluvatar加速卡指定可用加速卡的方式与NVIDIA GPU类似,请参考[使用指定GPU设备](https://opendatalab.github.io/MinerU/zh/usage/advanced_cli_parameters/#cuda_visible_devices)章节说明 \ No newline at end of file +> - Iluvatar加速卡指定可用加速卡的方式与NVIDIA GPU类似,请参考[使用指定GPU设备](https://opendatalab.github.io/MinerU/zh/usage/advanced_cli_parameters/#cuda_visible_devices)章节说明 +> - 在Iluvatar平台可以通过`ixsmi`命令查看加速卡的使用情况,并根据需要指定空闲的加速卡ID以避免资源冲突。 \ No newline at end of file diff --git a/docs/zh/usage/acceleration_cards/METAX.md 
b/docs/zh/usage/acceleration_cards/METAX.md index 95c46eec..e61b2547 100644 --- a/docs/zh/usage/acceleration_cards/METAX.md +++ b/docs/zh/usage/acceleration_cards/METAX.md @@ -148,4 +148,5 @@ docker run --ipc host \ 🔴: 不支持,无法运行,或精度存在较大差异 >[!TIP] ->MACA加速卡指定可用加速卡的方式与NVIDIA GPU类似,请参考[使用指定GPU设备](https://opendatalab.github.io/MinerU/zh/usage/advanced_cli_parameters/#cuda_visible_devices)章节说明。 \ No newline at end of file +> - MACA加速卡指定可用加速卡的方式与NVIDIA GPU类似,请参考[使用指定GPU设备](https://opendatalab.github.io/MinerU/zh/usage/advanced_cli_parameters/#cuda_visible_devices)章节说明。 +> - 在METAX平台可以通过`mx-smi`命令查看加速卡的使用情况,并根据需要指定空闲的加速卡ID以避免资源冲突。 \ No newline at end of file diff --git a/docs/zh/usage/acceleration_cards/MooreThreads.md b/docs/zh/usage/acceleration_cards/MooreThreads.md index 53dbdfa2..981fc26c 100644 --- a/docs/zh/usage/acceleration_cards/MooreThreads.md +++ b/docs/zh/usage/acceleration_cards/MooreThreads.md @@ -112,4 +112,5 @@ docker run -u root --name mineru_docker \ 🔴: 不支持,无法运行,或精度存在较大差异 >[!TIP] ->MooreThreads加速卡指定可用加速卡的方式与NVIDIA GPU类似,请参考[GPU 枚举](https://docs.mthreads.com/cloud-native/cloud-native-doc-online/install_guide/#gpu-%E6%9E%9A%E4%B8%BE) \ No newline at end of file +> - MooreThreads加速卡指定可用加速卡的方式与NVIDIA GPU类似,请参考[GPU 枚举](https://docs.mthreads.com/cloud-native/cloud-native-doc-online/install_guide/#gpu-%E6%9E%9A%E4%B8%BE) +> - 在MooreThreads平台可以通过`mthreads-gmi`命令查看加速卡的使用情况,并根据需要指定空闲的加速卡ID以避免资源冲突。 \ No newline at end of file diff --git a/docs/zh/usage/acceleration_cards/THead.md b/docs/zh/usage/acceleration_cards/THead.md index e1ca16e3..ac916095 100644 --- a/docs/zh/usage/acceleration_cards/THead.md +++ b/docs/zh/usage/acceleration_cards/THead.md @@ -139,4 +139,5 @@ docker run --privileged=true \ 🔴: 不支持,无法运行,或精度存在较大差异 >[!TIP] ->PPU加速卡指定可用加速卡的方式与NVIDIA GPU类似,请参考[使用指定GPU设备](https://opendatalab.github.io/MinerU/zh/usage/advanced_cli_parameters/#cuda_visible_devices)章节说明。 \ No newline at end of file +> - PPU加速卡指定可用加速卡的方式与NVIDIA 
def get_load_images_threads() -> int:
    """Return the worker count used when rendering PDF pages to images.

    The raw value of the ``MINERU_PDF_RENDER_THREADS`` environment variable
    (``None`` when the variable is unset) is handed to
    ``get_value_from_string`` together with a fallback of ``4``.

    Returns:
        int: Whatever ``get_value_from_string`` resolves for that input.
    """
    configured = os.environ.get('MINERU_PDF_RENDER_THREADS')
    return get_value_from_string(configured, default_value=4)
import os +import signal +import time from io import BytesIO import numpy as np @@ -9,13 +11,13 @@ from PIL import Image, ImageOps from mineru.data.data_reader_writer import FileBasedDataWriter from mineru.utils.check_sys_env import is_windows_environment -from mineru.utils.os_env_config import get_load_images_timeout +from mineru.utils.os_env_config import get_load_images_timeout, get_load_images_threads from mineru.utils.pdf_reader import image_to_b64str, image_to_bytes, page_to_image from mineru.utils.enum_class import ImageType from mineru.utils.hash_utils import str_sha256 from mineru.utils.pdf_page_id import get_end_page_id -from concurrent.futures import ProcessPoolExecutor, TimeoutError as FuturesTimeoutError +from concurrent.futures import ProcessPoolExecutor, wait, ALL_COMPLETED def pdf_page_to_image(page: pdfium.PdfPage, dpi=200, image_type=ImageType.PIL) -> dict: @@ -57,7 +59,7 @@ def load_images_from_pdf( end_page_id=None, image_type=ImageType.PIL, timeout=None, - threads=4, + threads=None, ): """带超时控制的 PDF 转图片函数,支持多进程加速 @@ -67,8 +69,8 @@ def load_images_from_pdf( start_page_id (int, optional): 起始页码. Defaults to 0. end_page_id (int | None, optional): 结束页码. Defaults to None. image_type (ImageType, optional): 图片类型. Defaults to ImageType.PIL. - timeout (int | None, optional): 超时时间(秒)。如果为 None,则从环境变量 MINERU_PDF_LOAD_IMAGES_TIMEOUT 读取,若未设置则默认为 300 秒。 - threads (int): 进程数,默认 4 + timeout (int | None, optional): 超时时间(秒)。如果为 None,则从环境变量 MINERU_PDF_RENDER_TIMEOUT 读取,若未设置则默认为 300 秒。 + threads (int): 进程数, 如果为 None,则从环境变量 MINERU_PDF_RENDER_THREADS 读取,若未设置则默认为 4. 
Raises: TimeoutError: 当转换超时时抛出 @@ -86,6 +88,9 @@ def load_images_from_pdf( else: if timeout is None: timeout = get_load_images_timeout() + if threads is None: + threads = get_load_images_threads() + end_page_id = get_end_page_id(end_page_id, len(pdf_doc)) # 计算总页数 @@ -108,11 +113,13 @@ def load_images_from_pdf( page_ranges.append((range_start, range_end)) - # logger.debug(f"PDF to images using {actual_threads} processes, page ranges: {page_ranges}") + logger.debug(f"PDF to images using {actual_threads} processes, page ranges: {page_ranges}") - with ProcessPoolExecutor(max_workers=actual_threads) as executor: + executor = ProcessPoolExecutor(max_workers=actual_threads) + try: # 提交所有任务 futures = [] + future_to_range = {} for range_start, range_end in page_ranges: future = executor.submit( _load_images_from_pdf_worker, @@ -122,27 +129,68 @@ def load_images_from_pdf( range_end, image_type, ) - futures.append((range_start, future)) + futures.append(future) + future_to_range[future] = range_start - try: - # 收集结果并按页码排序 - all_results = [] - for range_start, future in futures: - images_list = future.result(timeout=timeout) - all_results.append((range_start, images_list)) + # 使用 wait() 设置单一全局超时 + done, not_done = wait(futures, timeout=timeout, return_when=ALL_COMPLETED) - # 按起始页码排序并合并结果 - all_results.sort(key=lambda x: x[0]) - images_list = [] - for _, imgs in all_results: - images_list.extend(imgs) - - return images_list, pdf_doc - except FuturesTimeoutError: + # 检查是否有未完成的任务(超时情况) + if not_done: + # 超时:强制终止所有子进程 + _terminate_executor_processes(executor) pdf_doc.close() - executor.shutdown(wait=False, cancel_futures=True) raise TimeoutError(f"PDF to images conversion timeout after {timeout}s") + # 所有任务完成,收集结果 + all_results = [] + for future in futures: + range_start = future_to_range[future] + # 这里不需要 timeout,因为任务已完成 + images_list = future.result() + all_results.append((range_start, images_list)) + + # 按起始页码排序并合并结果 + all_results.sort(key=lambda x: x[0]) + images_list = [] + 
def _terminate_executor_processes(executor):
    """Forcefully stop the worker processes owned by a ProcessPoolExecutor.

    Every worker that is still alive first receives SIGTERM so it can exit
    gracefully; after a short grace period any survivor receives SIGKILL.
    The function is best-effort: it never raises because a worker has
    already gone away, and it is a no-op when there is nothing to stop.

    Args:
        executor: Object expected to expose the private ``_processes``
            mapping (pid -> process object with ``is_alive()``) that
            ``ProcessPoolExecutor`` maintains.

    Returns:
        None
    """
    # ``_processes`` is a private attribute: it may be missing on other
    # executor types, and CPython resets it to None inside shutdown(), so a
    # plain hasattr() check is not enough to make ``.items()`` safe.
    processes = getattr(executor, '_processes', None)
    if not processes:
        return

    # Work on a snapshot: the executor's manager thread can add or remove
    # workers concurrently, which would break iteration over the live dict.
    alive = [
        (pid, process)
        for pid, process in list(processes.items())
        if process.is_alive()
    ]
    if not alive:
        # Nothing was signalled, so there is nothing to wait for.
        return

    def _send(pid, sig):
        try:
            os.kill(pid, sig)
        except OSError:
            # Includes ProcessLookupError: the worker exited on its own
            # between the liveness check and the signal.
            pass

    # First pass: SIGTERM allows a graceful exit.
    for pid, _ in alive:
        _send(pid, signal.SIGTERM)

    # Give the workers a moment to react to SIGTERM.
    time.sleep(0.1)

    # signal.SIGKILL is not defined on Windows; fall back to SIGTERM there
    # instead of failing with AttributeError in the middle of cleanup.
    kill_signal = getattr(signal, 'SIGKILL', signal.SIGTERM)

    # Second pass: force-kill whatever ignored SIGTERM.
    for pid, process in alive:
        if process.is_alive():
            _send(pid, kill_signal)