Compare commits

..

1 Commits

Author SHA1 Message Date
Xiaomeng Zhao
a2dfab8ee9 Merge pull request #3903 from opendatalab/dev
Dev
2025-10-31 15:00:28 +08:00
16 changed files with 32 additions and 216 deletions

View File

@@ -44,10 +44,6 @@
</div>
# Changelog
- 2025/11/04 2.6.4 Release
- Added a timeout configuration for PDF image rendering. The default is 300 seconds; it can be configured via the environment variable `MINERU_PDF_RENDER_TIMEOUT` to prevent abnormal PDF files from blocking the rendering process for a long time.
- Added CPU thread count configuration options for ONNX models. The default is the system CPU core count; it can be configured via the environment variables `MINERU_INTRA_OP_NUM_THREADS` and `MINERU_INTER_OP_NUM_THREADS` to reduce CPU resource contention in high-concurrency scenarios.
- 2025/10/31 2.6.3 Release
- Added support for a new backend `vlm-mlx-engine`, enabling MLX-accelerated inference for the MinerU2.5 model on Apple Silicon devices. Compared to the `vlm-transformers` backend, `vlm-mlx-engine` delivers a 100%–200% speed improvement.
- Bug fixes: #3849, #3859

View File

@@ -44,10 +44,6 @@
</div>
# 更新记录
- 2025/11/04 2.6.4 发布
- 为pdf渲染图片增加超时配置，默认为300秒，可通过环境变量`MINERU_PDF_RENDER_TIMEOUT`进行配置，防止部分异常pdf文件导致渲染过程长时间阻塞。
- 为onnx模型增加cpu线程数配置选项，默认为系统cpu核心数，可通过环境变量`MINERU_INTRA_OP_NUM_THREADS`和`MINERU_INTER_OP_NUM_THREADS`进行配置，以减少高并发场景下对cpu资源的抢占冲突。
- 2025/10/31 2.6.3 发布
- 增加新后端`vlm-mlx-engine`，支持在Apple Silicon设备上使用`MLX`加速`MinerU2.5`模型推理，相比`vlm-transformers`后端，`vlm-mlx-engine`后端速度提升100%~200%。
- bug修复: #3849 #3859

View File

@@ -100,14 +100,3 @@ Here are the environment variables and their descriptions:
* Used to enable table merging functionality
* Default is `true`, can be set to `false` via environment variable to disable table merging functionality.
- `MINERU_PDF_RENDER_TIMEOUT`:
* Used to set the timeout period (in seconds) for rendering PDF to images
* Default is `300` seconds, can be set to other values via environment variable to adjust the image rendering timeout.
- `MINERU_INTRA_OP_NUM_THREADS`:
* Used to set the intra_op thread count for ONNX models, affects the computation speed of individual operators
* Default is `-1` (auto-select), can be set to other values via environment variable to adjust the thread count.
- `MINERU_INTER_OP_NUM_THREADS`:
* Used to set the inter_op thread count for ONNX models, affects the parallel execution of multiple operators
* Default is `-1` (auto-select), can be set to other values via environment variable to adjust the thread count.

View File

@@ -94,15 +94,3 @@ MinerU命令行工具的某些参数存在相同功能的环境变量配置
- `MINERU_TABLE_MERGE_ENABLE`
* 用于启用表格合并功能
* 默认为`true`,可通过环境变量设置为`false`来禁用表格合并功能。
- `MINERU_PDF_RENDER_TIMEOUT`
* 用于设置将PDF渲染为图片的超时时间
* 默认为`300`秒,可通过环境变量设置为其他值以调整渲染图片的超时时间。
- `MINERU_INTRA_OP_NUM_THREADS`
* 用于设置onnx模型的intra_op线程数，影响单个算子的计算速度
* 默认为`-1`(自动选择),可通过环境变量设置为其他值以调整线程数。
- `MINERU_INTER_OP_NUM_THREADS`
* 用于设置onnx模型的inter_op线程数，影响多个算子的并行执行
* 默认为`-1`(自动选择),可通过环境变量设置为其他值以调整线程数。

View File

@@ -99,10 +99,7 @@ def doc_analyze(
_lang = lang_list[pdf_idx]
# 收集每个数据集中的页面
# load_images_start = time.time()
images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
# load_images_time = round(time.time() - load_images_start, 2)
# logger.debug(f"load images cost: {load_images_time}, speed: {round(len(images_list) / load_images_time, 3)} images/s")
all_image_lists.append(images_list)
all_pdf_docs.append(pdf_doc)
for page_idx in range(len(images_list)):

View File

@@ -8,7 +8,7 @@ from .utils import enable_custom_logits_processors, set_default_gpu_memory_utili
from .model_output_to_middle_json import result_to_middle_json
from ...data.data_reader_writer import DataWriter
from mineru.utils.pdf_image_tools import load_images_from_pdf
from ...utils.check_sys_env import is_mac_os_version_supported
from ...utils.check_mac_env import is_mac_os_version_supported
from ...utils.config_reader import get_device
from ...utils.enum_class import ImageType
@@ -177,7 +177,7 @@ async def aio_doc_analyze(
images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
images_pil_list = [image_dict["img_pil"] for image_dict in images_list]
# load_images_time = round(time.time() - load_images_start, 2)
# logger.debug(f"load images cost: {load_images_time}, speed: {round(len(images_pil_list)/load_images_time, 3)} images/s")
# logger.info(f"load images cost: {load_images_time}, speed: {round(len(images_base64_list)/load_images_time, 3)} images/s")
# infer_start = time.time()
results = await predictor.aio_batch_two_step_extract(images=images_pil_list)

View File

@@ -4,7 +4,7 @@ import click
from pathlib import Path
from loguru import logger
from mineru.utils.check_sys_env import is_mac_os_version_supported
from mineru.utils.check_mac_env import is_mac_os_version_supported
from mineru.utils.cli_parser import arg_parse
from mineru.utils.config_reader import get_device
from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path

View File

@@ -5,8 +5,8 @@ import os
import copy
from pathlib import Path
from loguru import logger
import pypdfium2 as pdfium
from loguru import logger
from mineru.data.data_reader_writer import FileBasedDataWriter
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox, draw_line_sort_bbox
@@ -16,12 +16,10 @@ from mineru.utils.pdf_image_tools import images_bytes_to_pdf_bytes
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
from mineru.backend.vlm.vlm_analyze import aio_doc_analyze as aio_vlm_doc_analyze
from mineru.utils.pdf_page_id import get_end_page_id
pdf_suffixes = ["pdf"]
image_suffixes = ["png", "jpeg", "jp2", "webp", "gif", "bmp", "jpg", "tiff"]
os.environ["TOKENIZERS_PARALLELISM"] = "false"
def read_fn(path):
if not isinstance(path, Path):
@@ -46,10 +44,18 @@ def prepare_env(output_dir, pdf_file_name, parse_method):
def convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id=0, end_page_id=None):
pdf = pdfium.PdfDocument(pdf_bytes)
output_pdf = pdfium.PdfDocument.new()
try:
end_page_id = get_end_page_id(end_page_id, len(pdf))
# 从字节数据加载PDF
pdf = pdfium.PdfDocument(pdf_bytes)
# 确定结束页
end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(pdf) - 1
if end_page_id > len(pdf) - 1:
logger.warning("end_page_id is out of range, use pdf_docs length")
end_page_id = len(pdf) - 1
# 创建一个新的PDF文档
output_pdf = pdfium.PdfDocument.new()
# 选择要导入的页面索引
page_indices = list(range(start_page_id, end_page_id + 1))
@@ -63,12 +69,13 @@ def convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id=0, end_page
# 获取字节数据
output_bytes = output_buffer.getvalue()
pdf.close() # 关闭原PDF文档以释放资源
output_pdf.close() # 关闭新PDF文档以释放资源
except Exception as e:
logger.warning(f"Error in converting PDF bytes: {e}, Using original PDF bytes.")
output_bytes = pdf_bytes
pdf.close()
output_pdf.close()
return output_bytes

View File

@@ -13,7 +13,7 @@ from gradio_pdf import PDF
from loguru import logger
from mineru.cli.common import prepare_env, read_fn, aio_do_parse, pdf_suffixes, image_suffixes
from mineru.utils.check_sys_env import is_mac_os_version_supported
from mineru.utils.check_mac_env import is_mac_os_version_supported
from mineru.utils.cli_parser import arg_parse
from mineru.utils.hash_utils import str_sha256

View File

@@ -16,7 +16,6 @@ from typing import Any, Dict, List, Tuple
import numpy as np
from mineru.utils.os_env_config import get_op_num_threads
from .table_structure_utils import (
OrtInferSession,
TableLabelDecode,
@@ -30,9 +29,6 @@ class TableStructurer:
self.preprocess_op = TablePreprocess()
self.batch_preprocess_op = BatchTablePreprocess()
config["intra_op_num_threads"] = get_op_num_threads("MINERU_INTRA_OP_NUM_THREADS")
config["inter_op_num_threads"] = get_op_num_threads("MINERU_INTER_OP_NUM_THREADS")
self.session = OrtInferSession(config)
self.character = self.session.get_metadata()

View File

@@ -5,8 +5,6 @@ from typing import Optional, Dict, Any, Tuple
import cv2
import numpy as np
from skimage import measure
from mineru.utils.os_env_config import get_op_num_threads
from .utils import OrtInferSession, resize_img
from .utils_table_line_rec import (
get_table_line,
@@ -30,9 +28,6 @@ class TSRUnet:
self.inp_height = 1024
self.inp_width = 1024
config["intra_op_num_threads"] = get_op_num_threads("MINERU_INTRA_OP_NUM_THREADS")
config["inter_op_num_threads"] = get_op_num_threads("MINERU_INTER_OP_NUM_THREADS")
self.session = OrtInferSession(config)
def __call__(

View File

@@ -4,10 +4,6 @@ import platform
from packaging import version
def is_windows_environment() -> bool:
    """Return True when ``platform.system()`` reports ``"Windows"``."""
    system_name = platform.system()
    return system_name == "Windows"
def is_mac_environment() -> bool:
    """Return True when ``platform.system()`` reports ``"Darwin"`` (macOS)."""
    system_name = platform.system()
    return system_name == "Darwin"

View File

@@ -1,30 +0,0 @@
import os
def get_op_num_threads(env_name: str) -> int:
    """Read a thread-count setting from the environment variable ``env_name``.

    The raw value (or ``None`` when the variable is unset) is handed to
    ``get_value_from_string`` with a fallback of ``-1``.
    """
    raw_setting = os.environ.get(env_name)
    return get_value_from_string(raw_setting, -1)
def get_load_images_timeout() -> int:
    """Read the PDF render timeout from ``MINERU_PDF_RENDER_TIMEOUT``.

    The raw value (or ``None`` when the variable is unset) is handed to
    ``get_value_from_string`` with a fallback of ``300``.
    """
    raw_setting = os.environ.get('MINERU_PDF_RENDER_TIMEOUT')
    return get_value_from_string(raw_setting, 300)
def get_value_from_string(env_value: str, default_value: int) -> int:
    """Parse ``env_value`` as a strictly positive integer.

    Args:
        env_value: Raw string to parse; ``None`` is accepted and treated as
            "not set".
        default_value: Value returned whenever ``env_value`` is ``None``,
            is not a valid integer literal, or parses to zero or less.

    Returns:
        The parsed integer when it is greater than zero, otherwise
        ``default_value``.
    """
    if env_value is None:
        return default_value
    try:
        parsed = int(env_value)
    except ValueError:
        return default_value
    # Zero and negative numbers are rejected in favour of the default.
    return parsed if parsed > 0 else default_value
if __name__ == '__main__':
    # Manual smoke check: print how a few sample strings are parsed with a
    # fallback of -1, then print the effective PDF render timeout.
    for sample in ('1', '0', '-1', 'abc'):
        print(get_value_from_string(sample, -1))
    print(get_load_images_timeout())

View File

@@ -1,5 +1,4 @@
# Copyright (c) Opendatalab. All rights reserved.
import os
from io import BytesIO
import numpy as np
@@ -8,14 +7,9 @@ from loguru import logger
from PIL import Image
from mineru.data.data_reader_writer import FileBasedDataWriter
from mineru.utils.check_sys_env import is_windows_environment
from mineru.utils.os_env_config import get_load_images_timeout
from mineru.utils.pdf_reader import image_to_b64str, image_to_bytes, page_to_image
from mineru.utils.enum_class import ImageType
from mineru.utils.hash_utils import str_sha256
from mineru.utils.pdf_page_id import get_end_page_id
from concurrent.futures import ProcessPoolExecutor, TimeoutError as FuturesTimeoutError
from .enum_class import ImageType
from .hash_utils import str_sha256
def pdf_page_to_image(page: pdfium.PdfPage, dpi=200, image_type=ImageType.PIL) -> dict:
@@ -41,106 +35,7 @@ def pdf_page_to_image(page: pdfium.PdfPage, dpi=200, image_type=ImageType.PIL) -
return image_dict
def _load_images_from_pdf_worker(pdf_bytes, dpi, start_page_id, end_page_id, image_type):
    """Process-pool wrapper: forward all arguments to ``load_images_from_pdf_core``."""
    page_images = load_images_from_pdf_core(pdf_bytes, dpi, start_page_id, end_page_id, image_type)
    return page_images
def _split_page_ranges(start_page_id, end_page_id, workers):
    """Split an inclusive page range into ``workers`` contiguous, near-equal chunks."""
    base, extra = divmod(end_page_id - start_page_id + 1, workers)
    page_ranges = []
    range_start = start_page_id
    for worker_idx in range(workers):
        # The first ``extra`` chunks take one additional page so that chunk
        # sizes never differ by more than one.
        size = base + (1 if worker_idx < extra else 0)
        page_ranges.append((range_start, range_start + size - 1))
        range_start += size
    return page_ranges


def _terminate_pool_workers(executor):
    """Best-effort termination of worker processes still running after a timeout."""
    # NOTE(review): ProcessPoolExecutor offers no public way to stop a task
    # that is already running. ``_processes`` is a CPython implementation
    # detail, hence the defensive getattr; if it is absent nothing is killed.
    processes = getattr(executor, "_processes", None) or {}
    for process in list(processes.values()):
        if process.is_alive():
            process.terminate()


def load_images_from_pdf(
    pdf_bytes: bytes,
    dpi=200,
    start_page_id=0,
    end_page_id=None,
    image_type=ImageType.PIL,
    timeout=None,
    threads=4,
):
    """Render PDF pages to images, using a process pool with an overall timeout.

    On Windows the pages are rendered in the current process without a
    timeout. Elsewhere the page range is split into contiguous chunks that
    are rendered in parallel worker processes.

    Args:
        pdf_bytes (bytes): Raw bytes of the PDF file.
        dpi (int, optional): Rendering resolution. Defaults to 200.
        start_page_id (int, optional): First page index to render. Defaults to 0.
        end_page_id (int | None, optional): Last page index to render
            (inclusive). ``None`` or a negative value means the last page.
        image_type (ImageType, optional): Output image type. Defaults to
            ImageType.PIL.
        timeout (int | None, optional): Overall time budget in seconds for
            the whole conversion. When ``None`` it is read from the
            environment variable ``MINERU_PDF_RENDER_TIMEOUT`` (300 seconds
            if unset).
        threads (int): Maximum number of worker processes. Defaults to 4;
            values below 1 are treated as 1.

    Returns:
        tuple: ``(images_list, pdf_doc)`` where ``images_list`` holds the
        per-page results in page order and ``pdf_doc`` is the opened
        ``pdfium.PdfDocument``.

    Raises:
        TimeoutError: When the conversion does not finish within ``timeout``
            seconds.
    """
    import time  # local import: only needed for the deadline bookkeeping below

    pdf_doc = pdfium.PdfDocument(pdf_bytes)
    end_page_id = get_end_page_id(end_page_id, len(pdf_doc))

    if is_windows_environment():
        # No multiprocessing on Windows: render in-process.
        images_list = load_images_from_pdf_core(
            pdf_bytes,
            dpi,
            start_page_id,
            end_page_id,
            image_type
        )
        return images_list, pdf_doc

    total_pages = end_page_id - start_page_id + 1
    if total_pages <= 0:
        # Empty range: nothing to render, and a zero worker count would
        # otherwise cause a division by zero when splitting the pages.
        return [], pdf_doc

    if timeout is None:
        timeout = get_load_images_timeout()

    # Never use more workers than pages, and always at least one.
    actual_threads = max(1, min(os.cpu_count() or 1, threads, total_pages))
    page_ranges = _split_page_ranges(start_page_id, end_page_id, actual_threads)
    # logger.debug(f"PDF to images using {actual_threads} processes, page ranges: {page_ranges}")

    # The executor is deliberately NOT used as a context manager: leaving a
    # ``with`` block calls shutdown(wait=True), which would block on the very
    # workers that have just timed out.
    executor = ProcessPoolExecutor(max_workers=actual_threads)
    try:
        futures = [
            executor.submit(
                _load_images_from_pdf_worker,
                pdf_bytes,
                dpi,
                range_start,
                range_end,
                image_type
            )
            for range_start, range_end in page_ranges
        ]

        # One shared deadline, so the total wait is bounded by ``timeout``
        # rather than ``timeout`` multiplied by the number of chunks.
        deadline = time.monotonic() + timeout
        images_list = []
        # Futures were submitted in page order, so collecting them in
        # submission order already yields the pages in order.
        for future in futures:
            remaining = max(0.0, deadline - time.monotonic())
            images_list.extend(future.result(timeout=remaining))
        return images_list, pdf_doc
    except FuturesTimeoutError:
        pdf_doc.close()
        _terminate_pool_workers(executor)
        raise TimeoutError(f"PDF to images conversion timeout after {timeout}s")
    except Exception:
        # Do not leak the opened document when a worker fails.
        pdf_doc.close()
        raise
    finally:
        executor.shutdown(wait=False, cancel_futures=True)
def load_images_from_pdf_core(
pdf_bytes: bytes,
dpi=200,
start_page_id=0,
@@ -150,17 +45,18 @@ def load_images_from_pdf_core(
images_list = []
pdf_doc = pdfium.PdfDocument(pdf_bytes)
pdf_page_num = len(pdf_doc)
end_page_id = get_end_page_id(end_page_id, pdf_page_num)
end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else pdf_page_num - 1
if end_page_id > pdf_page_num - 1:
logger.warning("end_page_id is out of range, use images length")
end_page_id = pdf_page_num - 1
for index in range(start_page_id, end_page_id + 1):
# logger.debug(f"Converting page {index}/{pdf_page_num} to image")
page = pdf_doc[index]
image_dict = pdf_page_to_image(page, dpi=dpi, image_type=image_type)
images_list.append(image_dict)
for index in range(0, pdf_page_num):
if start_page_id <= index <= end_page_id:
page = pdf_doc[index]
image_dict = pdf_page_to_image(page, dpi=dpi, image_type=image_type)
images_list.append(image_dict)
pdf_doc.close()
return images_list
return images_list, pdf_doc
def cut_image(bbox: tuple, page_num: int, page_pil_img, return_path, image_writer: FileBasedDataWriter, scale=2):

View File

@@ -1,10 +0,0 @@
# Copyright (c) Opendatalab. All rights reserved.
from loguru import logger
def get_end_page_id(end_page_id, pdf_page_num):
    """Resolve the effective last page index for a PDF with ``pdf_page_num`` pages.

    ``None`` or a negative ``end_page_id`` selects the last page. A value
    beyond the last page is clamped to it and a warning is logged.
    """
    last_page_id = pdf_page_num - 1
    if end_page_id is None or end_page_id < 0:
        return last_page_id
    if end_page_id > last_page_id:
        logger.warning("end_page_id is out of range, use images length")
        return last_page_id
    return end_page_id

View File

@@ -1 +1 @@
__version__ = "2.6.3"
__version__ = "2.6.2"