mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
@@ -1,6 +1,6 @@
|
|||||||
# 基础镜像配置 vLLM 或 LMDeploy ,请根据实际需要选择其中一个,要求 amd64(x86-64) CPU + Cambricon MLU.
|
# 基础镜像配置 vLLM 或 LMDeploy ,请根据实际需要选择其中一个,要求 amd64(x86-64) CPU + Cambricon MLU.
|
||||||
# Base image containing the LMDEPLOY inference environment, requiring amd64(x86-64) CPU + Cambricon MLU.
|
# Base image containing the LMDEPLOY inference environment, requiring amd64(x86-64) CPU + Cambricon MLU.
|
||||||
FROM crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/camb:qwen_vl2.5
|
FROM crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/camb:qwen2.5_vl
|
||||||
ARG BACKEND=lmdeploy
|
ARG BACKEND=lmdeploy
|
||||||
# Base image containing the vLLM inference environment, requiring amd64(x86-64) CPU + Cambricon MLU.
|
# Base image containing the vLLM inference environment, requiring amd64(x86-64) CPU + Cambricon MLU.
|
||||||
# FROM crpi-vofi3w62lkohhxsp.cn-shanghai.personal.cr.aliyuncs.com/opendatalab-mineru/mlu:vllm0.8.3-torch2.6.0-torchmlu1.26.1-ubuntu22.04-py310
|
# FROM crpi-vofi3w62lkohhxsp.cn-shanghai.personal.cr.aliyuncs.com/opendatalab-mineru/mlu:vllm0.8.3-torch2.6.0-torchmlu1.26.1-ubuntu22.04-py310
|
||||||
|
|||||||
@@ -77,7 +77,8 @@ Here are the environment variables and their descriptions:
|
|||||||
- `MINERU_MODEL_SOURCE`:
|
- `MINERU_MODEL_SOURCE`:
|
||||||
* Used to specify model source
|
* Used to specify model source
|
||||||
* Supports `huggingface/modelscope/local`
|
* Supports `huggingface/modelscope/local`
|
||||||
* defaults to `huggingface`, can be switched to `modelscope` or local models through environment variables.
|
* Default is `huggingface`; you can switch via an environment variable to `modelscope` to use a domestic acceleration mirror, or switch to `local` to use a local model.
|
||||||
|
|
||||||
|
|
||||||
- `MINERU_TOOLS_CONFIG_JSON`:
|
- `MINERU_TOOLS_CONFIG_JSON`:
|
||||||
* Used to specify configuration file path
|
* Used to specify configuration file path
|
||||||
@@ -101,8 +102,14 @@ Here are the environment variables and their descriptions:
|
|||||||
* Default is `true`, can be set to `false` via environment variable to disable table merging functionality.
|
* Default is `true`, can be set to `false` via environment variable to disable table merging functionality.
|
||||||
|
|
||||||
- `MINERU_PDF_RENDER_TIMEOUT`:
|
- `MINERU_PDF_RENDER_TIMEOUT`:
|
||||||
* Used to set the timeout period (in seconds) for rendering PDF to images
|
* Used to set the timeout (in seconds) for rendering PDFs to images.
|
||||||
* Default is `300` seconds, can be set to other values via environment variable to adjust the image rendering timeout.
|
* Default is `300` seconds; you can set a different value via an environment variable to adjust the rendering timeout.
|
||||||
|
* Only effective on Linux and macOS systems.
|
||||||
|
|
||||||
|
- `MINERU_PDF_RENDER_THREADS`:
|
||||||
|
* Used to set the number of threads used when rendering PDFs to images.
|
||||||
|
* Default is `4`; you can set a different value via an environment variable to adjust the number of threads for image rendering.
|
||||||
|
* Only effective on Linux and macOS systems.
|
||||||
|
|
||||||
- `MINERU_INTRA_OP_NUM_THREADS`:
|
- `MINERU_INTRA_OP_NUM_THREADS`:
|
||||||
* Used to set the intra_op thread count for ONNX models, affects the computation speed of individual operators
|
* Used to set the intra_op thread count for ONNX models, affects the computation speed of individual operators
|
||||||
|
|||||||
@@ -175,4 +175,5 @@ docker run -u root --name mineru_docker --privileged=true \
|
|||||||
🔴: 不支持,无法运行,或精度存在较大差异
|
🔴: 不支持,无法运行,或精度存在较大差异
|
||||||
|
|
||||||
>[!TIP]
|
>[!TIP]
|
||||||
>NPU加速卡指定可用加速卡的方式与NVIDIA GPU类似,请参考[ASCEND_RT_VISIBLE_DEVICES](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/850alpha001/maintenref/envvar/envref_07_0028.html)
|
> - NPU加速卡指定可用加速卡的方式与NVIDIA GPU类似,请参考[ASCEND_RT_VISIBLE_DEVICES](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/850alpha001/maintenref/envvar/envref_07_0028.html)
|
||||||
|
> - 在Ascend平台可以通过`npu-smi info`命令查看加速卡的使用情况,并根据需要指定空闲的加速卡ID以避免资源冲突。
|
||||||
@@ -36,24 +36,11 @@ docker run --name mineru_docker \
|
|||||||
--privileged \
|
--privileged \
|
||||||
--ipc=host \
|
--ipc=host \
|
||||||
--network=host \
|
--network=host \
|
||||||
--cap-add SYS_PTRACE \
|
|
||||||
--device=/dev/mem \
|
|
||||||
--device=/dev/dri \
|
|
||||||
--device=/dev/infiniband \
|
|
||||||
--device=/dev/cambricon_ctl \
|
|
||||||
--device=/dev/cambricon_dev0 \
|
|
||||||
--device=/dev/cambricon_dev1 \
|
|
||||||
--device=/dev/cambricon_dev2 \
|
|
||||||
--device=/dev/cambricon_dev3 \
|
|
||||||
--device=/dev/cambricon_dev4 \
|
|
||||||
--device=/dev/cambricon_dev5 \
|
|
||||||
--device=/dev/cambricon_dev6 \
|
|
||||||
--device=/dev/cambricon_dev7 \
|
|
||||||
--group-add video \
|
|
||||||
--shm-size=400g \
|
--shm-size=400g \
|
||||||
--ulimit memlock=-1 \
|
--ulimit memlock=-1 \
|
||||||
--security-opt seccomp=unconfined \
|
-v /dev:/dev \
|
||||||
--security-opt apparmor=unconfined \
|
-v /lib/modules:/lib/modules:ro \
|
||||||
|
-v /usr/bin/cnmon:/usr/bin/cnmon \
|
||||||
-e MINERU_MODEL_SOURCE=local \
|
-e MINERU_MODEL_SOURCE=local \
|
||||||
-e MINERU_LMDEPLOY_DEVICE=camb \
|
-e MINERU_LMDEPLOY_DEVICE=camb \
|
||||||
-it mineru:mlu-lmdeploy-latest \
|
-it mineru:mlu-lmdeploy-latest \
|
||||||
@@ -86,7 +73,7 @@ docker run --name mineru_docker \
|
|||||||
不同环境下,MinerU对Cambricon加速卡的支持情况如下表所示:
|
不同环境下,MinerU对Cambricon加速卡的支持情况如下表所示:
|
||||||
|
|
||||||
>[!TIP]
|
>[!TIP]
|
||||||
> - `lmdeploy`黄灯问题为不能批量输出文件夹,单文件输入正常
|
> - `lmdeploy`黄灯问题为不能输入文件夹使用批量解析功能,输入单个文件时表现正常。
|
||||||
> - `vllm`黄灯问题为精度未对齐,在部分场景下可能出现预期外结果。
|
> - `vllm`黄灯问题为精度未对齐,在部分场景下可能出现预期外结果。
|
||||||
|
|
||||||
<table border="1">
|
<table border="1">
|
||||||
@@ -168,5 +155,6 @@ docker run --name mineru_docker \
|
|||||||
🔴: 不支持,无法运行,或精度存在较大差异
|
🔴: 不支持,无法运行,或精度存在较大差异
|
||||||
|
|
||||||
>[!TIP]
|
>[!TIP]
|
||||||
>Cambricon加速卡指定可用加速卡的方式与NVIDIA GPU类似,请参考[使用指定GPU设备](https://opendatalab.github.io/MinerU/zh/usage/advanced_cli_parameters/#cuda_visible_devices)章节说明,
|
> - Cambricon加速卡指定可用加速卡的方式与NVIDIA GPU类似,请参考[使用指定GPU设备](https://opendatalab.github.io/MinerU/zh/usage/advanced_cli_parameters/#cuda_visible_devices)章节说明,
|
||||||
>将环境变量`CUDA_VISIBLE_DEVICES`替换为`MLU_VISIBLE_DEVICES`即可。
|
>将环境变量`CUDA_VISIBLE_DEVICES`替换为`MLU_VISIBLE_DEVICES`即可。
|
||||||
|
> - 在Cambricon平台可以通过`cnmon`命令查看加速卡的使用情况,并根据需要指定空闲的加速卡ID以避免资源冲突。
|
||||||
@@ -105,5 +105,6 @@ docker run -u root --name mineru_docker \
|
|||||||
🔴: 不支持,无法运行,或精度存在较大差异
|
🔴: 不支持,无法运行,或精度存在较大差异
|
||||||
|
|
||||||
>[!TIP]
|
>[!TIP]
|
||||||
>GCU加速卡指定可用加速卡的方式与NVIDIA GPU类似,请参考[使用指定GPU设备](https://opendatalab.github.io/MinerU/zh/usage/advanced_cli_parameters/#cuda_visible_devices)章节说明,
|
> - GCU加速卡指定可用加速卡的方式与NVIDIA GPU类似,请参考[使用指定GPU设备](https://opendatalab.github.io/MinerU/zh/usage/advanced_cli_parameters/#cuda_visible_devices)章节说明,
|
||||||
>将环境变量`CUDA_VISIBLE_DEVICES`替换为`TOPS_VISIBLE_DEVICES`即可。
|
>将环境变量`CUDA_VISIBLE_DEVICES`替换为`TOPS_VISIBLE_DEVICES`即可。
|
||||||
|
> - 在Enflame平台可以通过`efsmi`命令查看加速卡的使用情况,并根据需要指定空闲的加速卡ID以避免资源冲突。
|
||||||
@@ -2,7 +2,7 @@
|
|||||||
以下为本指南测试使用的平台信息,供参考:
|
以下为本指南测试使用的平台信息,供参考:
|
||||||
```
|
```
|
||||||
os: Ubuntu 22.04.3 LTS
|
os: Ubuntu 22.04.3 LTS
|
||||||
cpu: Hygon Hygon C86-4G(x86-64)
|
cpu: Hygon C86-4G(x86-64)
|
||||||
dcu: BW200
|
dcu: BW200
|
||||||
driver: 6.3.13-V1.12.0a
|
driver: 6.3.13-V1.12.0a
|
||||||
docker: 20.10.24
|
docker: 20.10.24
|
||||||
@@ -112,4 +112,5 @@ docker run -u root --name mineru_docker \
|
|||||||
🔴: 不支持,无法运行,或精度存在较大差异
|
🔴: 不支持,无法运行,或精度存在较大差异
|
||||||
|
|
||||||
>[!TIP]
|
>[!TIP]
|
||||||
>DCU加速卡指定可用加速卡的方式与AMD GPU类似,请参考[GPU isolation techniques](https://rocm.docs.amd.com/en/docs-6.2.4/conceptual/gpu-isolation.html)
|
> - DCU加速卡指定可用加速卡的方式与AMD GPU类似,请参考[GPU isolation techniques](https://rocm.docs.amd.com/en/docs-6.2.4/conceptual/gpu-isolation.html)
|
||||||
|
> - 在Hygon平台可以通过`hy-smi`命令查看加速卡的使用情况,并根据需要指定空闲的加速卡ID以避免资源冲突。
|
||||||
@@ -119,4 +119,5 @@ docker run --name mineru_docker \
|
|||||||
🔴: 不支持,无法运行,或精度存在较大差异
|
🔴: 不支持,无法运行,或精度存在较大差异
|
||||||
|
|
||||||
>[!TIP]
|
>[!TIP]
|
||||||
>Iluvatar加速卡指定可用加速卡的方式与NVIDIA GPU类似,请参考[使用指定GPU设备](https://opendatalab.github.io/MinerU/zh/usage/advanced_cli_parameters/#cuda_visible_devices)章节说明
|
> - Iluvatar加速卡指定可用加速卡的方式与NVIDIA GPU类似,请参考[使用指定GPU设备](https://opendatalab.github.io/MinerU/zh/usage/advanced_cli_parameters/#cuda_visible_devices)章节说明
|
||||||
|
> - 在Iluvatar平台可以通过`ixsmi`命令查看加速卡的使用情况,并根据需要指定空闲的加速卡ID以避免资源冲突。
|
||||||
@@ -148,4 +148,5 @@ docker run --ipc host \
|
|||||||
🔴: 不支持,无法运行,或精度存在较大差异
|
🔴: 不支持,无法运行,或精度存在较大差异
|
||||||
|
|
||||||
>[!TIP]
|
>[!TIP]
|
||||||
>MACA加速卡指定可用加速卡的方式与NVIDIA GPU类似,请参考[使用指定GPU设备](https://opendatalab.github.io/MinerU/zh/usage/advanced_cli_parameters/#cuda_visible_devices)章节说明。
|
> - MACA加速卡指定可用加速卡的方式与NVIDIA GPU类似,请参考[使用指定GPU设备](https://opendatalab.github.io/MinerU/zh/usage/advanced_cli_parameters/#cuda_visible_devices)章节说明。
|
||||||
|
> - 在METAX平台可以通过`mx-smi`命令查看加速卡的使用情况,并根据需要指定空闲的加速卡ID以避免资源冲突。
|
||||||
@@ -112,4 +112,5 @@ docker run -u root --name mineru_docker \
|
|||||||
🔴: 不支持,无法运行,或精度存在较大差异
|
🔴: 不支持,无法运行,或精度存在较大差异
|
||||||
|
|
||||||
>[!TIP]
|
>[!TIP]
|
||||||
>MooreThreads加速卡指定可用加速卡的方式与NVIDIA GPU类似,请参考[GPU 枚举](https://docs.mthreads.com/cloud-native/cloud-native-doc-online/install_guide/#gpu-%E6%9E%9A%E4%B8%BE)
|
> - MooreThreads加速卡指定可用加速卡的方式与NVIDIA GPU类似,请参考[GPU 枚举](https://docs.mthreads.com/cloud-native/cloud-native-doc-online/install_guide/#gpu-%E6%9E%9A%E4%B8%BE)
|
||||||
|
> - 在MooreThreads平台可以通过`mthreads-gmi`命令查看加速卡的使用情况,并根据需要指定空闲的加速卡ID以避免资源冲突。
|
||||||
@@ -139,4 +139,5 @@ docker run --privileged=true \
|
|||||||
🔴: 不支持,无法运行,或精度存在较大差异
|
🔴: 不支持,无法运行,或精度存在较大差异
|
||||||
|
|
||||||
>[!TIP]
|
>[!TIP]
|
||||||
>PPU加速卡指定可用加速卡的方式与NVIDIA GPU类似,请参考[使用指定GPU设备](https://opendatalab.github.io/MinerU/zh/usage/advanced_cli_parameters/#cuda_visible_devices)章节说明。
|
> - PPU加速卡指定可用加速卡的方式与NVIDIA GPU类似,请参考[使用指定GPU设备](https://opendatalab.github.io/MinerU/zh/usage/advanced_cli_parameters/#cuda_visible_devices)章节说明。
|
||||||
|
> - 在T-Head平台可以通过`ppu-smi`命令查看加速卡的使用情况,并根据需要指定空闲的加速卡ID以避免资源冲突。
|
||||||
@@ -72,7 +72,7 @@ MinerU命令行工具的某些参数存在相同功能的环境变量配置,
|
|||||||
- `MINERU_MODEL_SOURCE`:
|
- `MINERU_MODEL_SOURCE`:
|
||||||
* 用于指定模型来源
|
* 用于指定模型来源
|
||||||
* 支持`huggingface/modelscope/local`
|
* 支持`huggingface/modelscope/local`
|
||||||
* 默认为`huggingface`可通过环境变量切换为`modelscope`或使用本地模型。
|
* 默认为`huggingface`,可通过环境变量切换为`modelscope`以使用国内加速源,或切换至`local`以使用本地模型。
|
||||||
|
|
||||||
- `MINERU_TOOLS_CONFIG_JSON`:
|
- `MINERU_TOOLS_CONFIG_JSON`:
|
||||||
* 用于指定配置文件路径
|
* 用于指定配置文件路径
|
||||||
@@ -98,6 +98,12 @@ MinerU命令行工具的某些参数存在相同功能的环境变量配置,
|
|||||||
- `MINERU_PDF_RENDER_TIMEOUT`:
|
- `MINERU_PDF_RENDER_TIMEOUT`:
|
||||||
* 用于设置将PDF渲染为图片的超时时间(秒)
|
* 用于设置将PDF渲染为图片的超时时间(秒)
|
||||||
* 默认为`300`秒,可通过环境变量设置为其他值以调整渲染图片的超时时间。
|
* 默认为`300`秒,可通过环境变量设置为其他值以调整渲染图片的超时时间。
|
||||||
|
* 仅在Linux和macOS系统中生效。
|
||||||
|
|
||||||
|
- `MINERU_PDF_RENDER_THREADS`:
|
||||||
|
* 用于设置将PDF渲染为图片时使用的线程数
|
||||||
|
* 默认为`4`,可通过环境变量设置为其他值以调整渲染图片时的线程数。
|
||||||
|
* 仅在Linux和macOS系统中生效。
|
||||||
|
|
||||||
- `MINERU_INTRA_OP_NUM_THREADS`:
|
- `MINERU_INTRA_OP_NUM_THREADS`:
|
||||||
* 用于设置onnx模型的intra_op线程数,影响单个算子的计算速度
|
* 用于设置onnx模型的intra_op线程数,影响单个算子的计算速度
|
||||||
|
|||||||
@@ -11,6 +11,11 @@ def get_load_images_timeout() -> int:
|
|||||||
return get_value_from_string(env_value, 300)
|
return get_value_from_string(env_value, 300)
|
||||||
|
|
||||||
|
|
||||||
|
def get_load_images_threads() -> int:
    """Return the configured worker count for rendering PDF pages to images.

    The raw value is read from the ``MINERU_PDF_RENDER_THREADS`` environment
    variable and handed to ``get_value_from_string`` together with the
    fallback ``4``, which is used when the variable is not set.

    Returns:
        int: Whatever ``get_value_from_string`` yields for the raw value
        (presumably the parsed integer, or 4 on a missing/invalid value --
        confirm against that helper).
    """
    # os.getenv already returns None for an unset variable.
    return get_value_from_string(os.getenv('MINERU_PDF_RENDER_THREADS'), 4)
|
||||||
|
|
||||||
|
|
||||||
def get_value_from_string(env_value: str, default_value: int) -> int:
|
def get_value_from_string(env_value: str, default_value: int) -> int:
|
||||||
if env_value is not None:
|
if env_value is not None:
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -1,5 +1,7 @@
|
|||||||
# Copyright (c) Opendatalab. All rights reserved.
|
# Copyright (c) Opendatalab. All rights reserved.
|
||||||
import os
|
import os
|
||||||
|
import signal
|
||||||
|
import time
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@@ -9,13 +11,13 @@ from PIL import Image, ImageOps
|
|||||||
|
|
||||||
from mineru.data.data_reader_writer import FileBasedDataWriter
|
from mineru.data.data_reader_writer import FileBasedDataWriter
|
||||||
from mineru.utils.check_sys_env import is_windows_environment
|
from mineru.utils.check_sys_env import is_windows_environment
|
||||||
from mineru.utils.os_env_config import get_load_images_timeout
|
from mineru.utils.os_env_config import get_load_images_timeout, get_load_images_threads
|
||||||
from mineru.utils.pdf_reader import image_to_b64str, image_to_bytes, page_to_image
|
from mineru.utils.pdf_reader import image_to_b64str, image_to_bytes, page_to_image
|
||||||
from mineru.utils.enum_class import ImageType
|
from mineru.utils.enum_class import ImageType
|
||||||
from mineru.utils.hash_utils import str_sha256
|
from mineru.utils.hash_utils import str_sha256
|
||||||
from mineru.utils.pdf_page_id import get_end_page_id
|
from mineru.utils.pdf_page_id import get_end_page_id
|
||||||
|
|
||||||
from concurrent.futures import ProcessPoolExecutor, TimeoutError as FuturesTimeoutError
|
from concurrent.futures import ProcessPoolExecutor, wait, ALL_COMPLETED
|
||||||
|
|
||||||
|
|
||||||
def pdf_page_to_image(page: pdfium.PdfPage, dpi=200, image_type=ImageType.PIL) -> dict:
|
def pdf_page_to_image(page: pdfium.PdfPage, dpi=200, image_type=ImageType.PIL) -> dict:
|
||||||
@@ -57,7 +59,7 @@ def load_images_from_pdf(
|
|||||||
end_page_id=None,
|
end_page_id=None,
|
||||||
image_type=ImageType.PIL,
|
image_type=ImageType.PIL,
|
||||||
timeout=None,
|
timeout=None,
|
||||||
threads=4,
|
threads=None,
|
||||||
):
|
):
|
||||||
"""带超时控制的 PDF 转图片函数,支持多进程加速
|
"""带超时控制的 PDF 转图片函数,支持多进程加速
|
||||||
|
|
||||||
@@ -67,8 +69,8 @@ def load_images_from_pdf(
|
|||||||
start_page_id (int, optional): 起始页码. Defaults to 0.
|
start_page_id (int, optional): 起始页码. Defaults to 0.
|
||||||
end_page_id (int | None, optional): 结束页码. Defaults to None.
|
end_page_id (int | None, optional): 结束页码. Defaults to None.
|
||||||
image_type (ImageType, optional): 图片类型. Defaults to ImageType.PIL.
|
image_type (ImageType, optional): 图片类型. Defaults to ImageType.PIL.
|
||||||
timeout (int | None, optional): 超时时间(秒)。如果为 None,则从环境变量 MINERU_PDF_LOAD_IMAGES_TIMEOUT 读取,若未设置则默认为 300 秒。
|
timeout (int | None, optional): 超时时间(秒)。如果为 None,则从环境变量 MINERU_PDF_RENDER_TIMEOUT 读取,若未设置则默认为 300 秒。
|
||||||
threads (int): 进程数,默认 4
|
threads (int): 进程数, 如果为 None,则从环境变量 MINERU_PDF_RENDER_THREADS 读取,若未设置则默认为 4.
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
TimeoutError: 当转换超时时抛出
|
TimeoutError: 当转换超时时抛出
|
||||||
@@ -86,6 +88,9 @@ def load_images_from_pdf(
|
|||||||
else:
|
else:
|
||||||
if timeout is None:
|
if timeout is None:
|
||||||
timeout = get_load_images_timeout()
|
timeout = get_load_images_timeout()
|
||||||
|
if threads is None:
|
||||||
|
threads = get_load_images_threads()
|
||||||
|
|
||||||
end_page_id = get_end_page_id(end_page_id, len(pdf_doc))
|
end_page_id = get_end_page_id(end_page_id, len(pdf_doc))
|
||||||
|
|
||||||
# 计算总页数
|
# 计算总页数
|
||||||
@@ -108,11 +113,13 @@ def load_images_from_pdf(
|
|||||||
|
|
||||||
page_ranges.append((range_start, range_end))
|
page_ranges.append((range_start, range_end))
|
||||||
|
|
||||||
# logger.debug(f"PDF to images using {actual_threads} processes, page ranges: {page_ranges}")
|
logger.debug(f"PDF to images using {actual_threads} processes, page ranges: {page_ranges}")
|
||||||
|
|
||||||
with ProcessPoolExecutor(max_workers=actual_threads) as executor:
|
executor = ProcessPoolExecutor(max_workers=actual_threads)
|
||||||
|
try:
|
||||||
# 提交所有任务
|
# 提交所有任务
|
||||||
futures = []
|
futures = []
|
||||||
|
future_to_range = {}
|
||||||
for range_start, range_end in page_ranges:
|
for range_start, range_end in page_ranges:
|
||||||
future = executor.submit(
|
future = executor.submit(
|
||||||
_load_images_from_pdf_worker,
|
_load_images_from_pdf_worker,
|
||||||
@@ -122,27 +129,68 @@ def load_images_from_pdf(
|
|||||||
range_end,
|
range_end,
|
||||||
image_type,
|
image_type,
|
||||||
)
|
)
|
||||||
futures.append((range_start, future))
|
futures.append(future)
|
||||||
|
future_to_range[future] = range_start
|
||||||
|
|
||||||
try:
|
# 使用 wait() 设置单一全局超时
|
||||||
# 收集结果并按页码排序
|
done, not_done = wait(futures, timeout=timeout, return_when=ALL_COMPLETED)
|
||||||
all_results = []
|
|
||||||
for range_start, future in futures:
|
|
||||||
images_list = future.result(timeout=timeout)
|
|
||||||
all_results.append((range_start, images_list))
|
|
||||||
|
|
||||||
# 按起始页码排序并合并结果
|
# 检查是否有未完成的任务(超时情况)
|
||||||
all_results.sort(key=lambda x: x[0])
|
if not_done:
|
||||||
images_list = []
|
# 超时:强制终止所有子进程
|
||||||
for _, imgs in all_results:
|
_terminate_executor_processes(executor)
|
||||||
images_list.extend(imgs)
|
|
||||||
|
|
||||||
return images_list, pdf_doc
|
|
||||||
except FuturesTimeoutError:
|
|
||||||
pdf_doc.close()
|
pdf_doc.close()
|
||||||
executor.shutdown(wait=False, cancel_futures=True)
|
|
||||||
raise TimeoutError(f"PDF to images conversion timeout after {timeout}s")
|
raise TimeoutError(f"PDF to images conversion timeout after {timeout}s")
|
||||||
|
|
||||||
|
# 所有任务完成,收集结果
|
||||||
|
all_results = []
|
||||||
|
for future in futures:
|
||||||
|
range_start = future_to_range[future]
|
||||||
|
# 这里不需要 timeout,因为任务已完成
|
||||||
|
images_list = future.result()
|
||||||
|
all_results.append((range_start, images_list))
|
||||||
|
|
||||||
|
# 按起始页码排序并合并结果
|
||||||
|
all_results.sort(key=lambda x: x[0])
|
||||||
|
images_list = []
|
||||||
|
for _, imgs in all_results:
|
||||||
|
images_list.extend(imgs)
|
||||||
|
|
||||||
|
return images_list, pdf_doc
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
# 发生任何异常时,确保清理子进程
|
||||||
|
_terminate_executor_processes(executor)
|
||||||
|
pdf_doc.close()
|
||||||
|
if isinstance(e, TimeoutError):
|
||||||
|
raise
|
||||||
|
raise
|
||||||
|
finally:
|
||||||
|
executor.shutdown(wait=False, cancel_futures=True)
|
||||||
|
|
||||||
|
|
||||||
|
def _terminate_executor_processes(executor):
|
||||||
|
"""强制终止 ProcessPoolExecutor 中的所有子进程"""
|
||||||
|
if hasattr(executor, '_processes'):
|
||||||
|
for pid, process in executor._processes.items():
|
||||||
|
if process.is_alive():
|
||||||
|
try:
|
||||||
|
# 先发送 SIGTERM 允许优雅退出
|
||||||
|
os.kill(pid, signal.SIGTERM)
|
||||||
|
except (ProcessLookupError, OSError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
# 给子进程一点时间响应 SIGTERM
|
||||||
|
time.sleep(0.1)
|
||||||
|
|
||||||
|
# 对仍然存活的进程发送 SIGKILL 强制终止
|
||||||
|
for pid, process in executor._processes.items():
|
||||||
|
if process.is_alive():
|
||||||
|
try:
|
||||||
|
os.kill(pid, signal.SIGKILL)
|
||||||
|
except (ProcessLookupError, OSError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
def load_images_from_pdf_core(
|
def load_images_from_pdf_core(
|
||||||
pdf_bytes: bytes,
|
pdf_bytes: bytes,
|
||||||
|
|||||||
Reference in New Issue
Block a user