mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-04-05 23:48:36 +07:00
Compare commits
35 Commits
release-2.
...
release-2.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f11e609a14 | ||
|
|
e010b0974a | ||
|
|
fe1549960d | ||
|
|
df23e45861 | ||
|
|
5ec07ee7ab | ||
|
|
f1ebf5a7f0 | ||
|
|
dae2cc8514 | ||
|
|
5de8f1a19f | ||
|
|
be2369bdd4 | ||
|
|
51df4d8508 | ||
|
|
f7225d8e17 | ||
|
|
a9c9501af6 | ||
|
|
74de2725cb | ||
|
|
6250c453d9 | ||
|
|
54417a51f8 | ||
|
|
2f120db20e | ||
|
|
2079395774 | ||
|
|
b4c57116c1 | ||
|
|
ace7f76869 | ||
|
|
5349fd7ccd | ||
|
|
5999f6664f | ||
|
|
245ae28c27 | ||
|
|
4afa045545 | ||
|
|
c32ff88400 | ||
|
|
4214634de8 | ||
|
|
bffc6aff53 | ||
|
|
05e114f8b9 | ||
|
|
66d5f3dfd2 | ||
|
|
305e3a61e8 | ||
|
|
b614bef035 | ||
|
|
cce16daf1f | ||
|
|
94eb35ffda | ||
|
|
1ebc1ae841 | ||
|
|
374ace0a34 | ||
|
|
a33715c015 |
@@ -44,6 +44,10 @@
|
||||
</div>
|
||||
|
||||
# Changelog
|
||||
- 2025/11/04 2.6.4 Release
|
||||
- Added a timeout for rendering PDF pages to images. The default is 300 seconds and can be changed via the environment variable `MINERU_PDF_RENDER_TIMEOUT`; this prevents abnormal PDF files from blocking the rendering process for a long time.
|
||||
- Added CPU thread count options for ONNX models. The default is the system CPU core count and can be changed via the environment variables `MINERU_INTRA_OP_NUM_THREADS` and `MINERU_INTER_OP_NUM_THREADS` to reduce CPU resource contention in high-concurrency scenarios.
|
||||
|
||||
- 2025/10/31 2.6.3 Release
|
||||
- Added support for a new backend `vlm-mlx-engine`, enabling MLX-accelerated inference for the MinerU2.5 model on Apple Silicon devices. Compared to the `vlm-transformers` backend, `vlm-mlx-engine` delivers a 100%–200% speed improvement.
|
||||
- Bug fixes: #3849, #3859
|
||||
|
||||
@@ -44,6 +44,10 @@
|
||||
</div>
|
||||
|
||||
# 更新记录
|
||||
- 2025/11/04 2.6.4 发布
|
||||
- 为pdf渲染图片增加超时配置,默认为300秒,可通过环境变量`MINERU_PDF_RENDER_TIMEOUT`进行配置,防止部分异常pdf文件导致渲染过程长时间阻塞。
|
||||
- 为onnx模型增加cpu线程数配置选项,默认为系统cpu核心数,可通过环境变量`MINERU_INTRA_OP_NUM_THREADS`和`MINERU_INTER_OP_NUM_THREADS`进行配置,以减少高并发场景下的对cpu资源的抢占冲突。
|
||||
|
||||
- 2025/10/31 2.6.3 发布
|
||||
- 增加新后端`vlm-mlx-engine`支持,在Apple Silicon设备上支持使用`MLX`加速`MinerU2.5`模型推理,相比`vlm-transformers`后端,`vlm-mlx-engine`后端速度提升100%~200%。
|
||||
- bug修复: #3849 #3859
|
||||
|
||||
@@ -100,3 +100,14 @@ Here are the environment variables and their descriptions:
|
||||
* Used to enable table merging functionality
|
||||
* Default is `true`, can be set to `false` via environment variable to disable table merging functionality.
|
||||
|
||||
- `MINERU_PDF_RENDER_TIMEOUT`:
|
||||
* Used to set the timeout period (in seconds) for rendering PDF to images
|
||||
* Default is `300` seconds, can be set to other values via environment variable to adjust the image rendering timeout.
|
||||
|
||||
- `MINERU_INTRA_OP_NUM_THREADS`:
|
||||
* Used to set the intra_op thread count for ONNX models, affects the computation speed of individual operators
|
||||
* Default is `-1` (auto-select), can be set to other values via environment variable to adjust the thread count.
|
||||
|
||||
- `MINERU_INTER_OP_NUM_THREADS`:
|
||||
* Used to set the inter_op thread count for ONNX models, affects the parallel execution of multiple operators
|
||||
* Default is `-1` (auto-select), can be set to other values via environment variable to adjust the thread count.
|
||||
|
||||
@@ -94,3 +94,15 @@ MinerU命令行工具的某些参数存在相同功能的环境变量配置,
|
||||
- `MINERU_TABLE_MERGE_ENABLE`:
|
||||
* 用于启用表格合并功能
|
||||
* 默认为`true`,可通过环境变量设置为`false`来禁用表格合并功能。
|
||||
|
||||
- `MINERU_PDF_RENDER_TIMEOUT`:
|
||||
* 用于设置将PDF渲染为图片的超时时间(秒)
|
||||
* 默认为`300`秒,可通过环境变量设置为其他值以调整渲染图片的超时时间。
|
||||
|
||||
- `MINERU_INTRA_OP_NUM_THREADS`:
|
||||
* 用于设置onnx模型的intra_op线程数,影响单个算子的计算速度
|
||||
* 默认为`-1`(自动选择),可通过环境变量设置为其他值以调整线程数。
|
||||
|
||||
- `MINERU_INTER_OP_NUM_THREADS`:
|
||||
* 用于设置onnx模型的inter_op线程数,影响多个算子的并行执行
|
||||
* 默认为`-1`(自动选择),可通过环境变量设置为其他值以调整线程数。
|
||||
|
||||
@@ -99,7 +99,10 @@ def doc_analyze(
|
||||
_lang = lang_list[pdf_idx]
|
||||
|
||||
# 收集每个数据集中的页面
|
||||
# load_images_start = time.time()
|
||||
images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
|
||||
# load_images_time = round(time.time() - load_images_start, 2)
|
||||
# logger.debug(f"load images cost: {load_images_time}, speed: {round(len(images_list) / load_images_time, 3)} images/s")
|
||||
all_image_lists.append(images_list)
|
||||
all_pdf_docs.append(pdf_doc)
|
||||
for page_idx in range(len(images_list)):
|
||||
|
||||
@@ -8,7 +8,7 @@ from .utils import enable_custom_logits_processors, set_default_gpu_memory_utili
|
||||
from .model_output_to_middle_json import result_to_middle_json
|
||||
from ...data.data_reader_writer import DataWriter
|
||||
from mineru.utils.pdf_image_tools import load_images_from_pdf
|
||||
from ...utils.check_mac_env import is_mac_os_version_supported
|
||||
from ...utils.check_sys_env import is_mac_os_version_supported
|
||||
from ...utils.config_reader import get_device
|
||||
|
||||
from ...utils.enum_class import ImageType
|
||||
@@ -177,7 +177,7 @@ async def aio_doc_analyze(
|
||||
images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
|
||||
images_pil_list = [image_dict["img_pil"] for image_dict in images_list]
|
||||
# load_images_time = round(time.time() - load_images_start, 2)
|
||||
# logger.info(f"load images cost: {load_images_time}, speed: {round(len(images_base64_list)/load_images_time, 3)} images/s")
|
||||
# logger.debug(f"load images cost: {load_images_time}, speed: {round(len(images_pil_list)/load_images_time, 3)} images/s")
|
||||
|
||||
# infer_start = time.time()
|
||||
results = await predictor.aio_batch_two_step_extract(images=images_pil_list)
|
||||
|
||||
@@ -4,7 +4,7 @@ import click
|
||||
from pathlib import Path
|
||||
from loguru import logger
|
||||
|
||||
from mineru.utils.check_mac_env import is_mac_os_version_supported
|
||||
from mineru.utils.check_sys_env import is_mac_os_version_supported
|
||||
from mineru.utils.cli_parser import arg_parse
|
||||
from mineru.utils.config_reader import get_device
|
||||
from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path
|
||||
|
||||
@@ -5,8 +5,8 @@ import os
|
||||
import copy
|
||||
from pathlib import Path
|
||||
|
||||
import pypdfium2 as pdfium
|
||||
from loguru import logger
|
||||
import pypdfium2 as pdfium
|
||||
|
||||
from mineru.data.data_reader_writer import FileBasedDataWriter
|
||||
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox, draw_line_sort_bbox
|
||||
@@ -16,10 +16,12 @@ from mineru.utils.pdf_image_tools import images_bytes_to_pdf_bytes
|
||||
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
|
||||
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
|
||||
from mineru.backend.vlm.vlm_analyze import aio_doc_analyze as aio_vlm_doc_analyze
|
||||
from mineru.utils.pdf_page_id import get_end_page_id
|
||||
|
||||
pdf_suffixes = ["pdf"]
|
||||
image_suffixes = ["png", "jpeg", "jp2", "webp", "gif", "bmp", "jpg", "tiff"]
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
|
||||
def read_fn(path):
|
||||
if not isinstance(path, Path):
|
||||
@@ -44,18 +46,10 @@ def prepare_env(output_dir, pdf_file_name, parse_method):
|
||||
|
||||
|
||||
def convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id=0, end_page_id=None):
|
||||
pdf = pdfium.PdfDocument(pdf_bytes)
|
||||
output_pdf = pdfium.PdfDocument.new()
|
||||
try:
|
||||
# 从字节数据加载PDF
|
||||
pdf = pdfium.PdfDocument(pdf_bytes)
|
||||
|
||||
# 确定结束页
|
||||
end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(pdf) - 1
|
||||
if end_page_id > len(pdf) - 1:
|
||||
logger.warning("end_page_id is out of range, use pdf_docs length")
|
||||
end_page_id = len(pdf) - 1
|
||||
|
||||
# 创建一个新的PDF文档
|
||||
output_pdf = pdfium.PdfDocument.new()
|
||||
end_page_id = get_end_page_id(end_page_id, len(pdf))
|
||||
|
||||
# 选择要导入的页面索引
|
||||
page_indices = list(range(start_page_id, end_page_id + 1))
|
||||
@@ -69,13 +63,12 @@ def convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id=0, end_page
|
||||
|
||||
# 获取字节数据
|
||||
output_bytes = output_buffer.getvalue()
|
||||
|
||||
pdf.close() # 关闭原PDF文档以释放资源
|
||||
output_pdf.close() # 关闭新PDF文档以释放资源
|
||||
except Exception as e:
|
||||
logger.warning(f"Error in converting PDF bytes: {e}, Using original PDF bytes.")
|
||||
output_bytes = pdf_bytes
|
||||
|
||||
pdf.close()
|
||||
output_pdf.close()
|
||||
return output_bytes
|
||||
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@ from gradio_pdf import PDF
|
||||
from loguru import logger
|
||||
|
||||
from mineru.cli.common import prepare_env, read_fn, aio_do_parse, pdf_suffixes, image_suffixes
|
||||
from mineru.utils.check_mac_env import is_mac_os_version_supported
|
||||
from mineru.utils.check_sys_env import is_mac_os_version_supported
|
||||
from mineru.utils.cli_parser import arg_parse
|
||||
from mineru.utils.hash_utils import str_sha256
|
||||
|
||||
|
||||
@@ -16,6 +16,7 @@ from typing import Any, Dict, List, Tuple
|
||||
|
||||
import numpy as np
|
||||
|
||||
from mineru.utils.os_env_config import get_op_num_threads
|
||||
from .table_structure_utils import (
|
||||
OrtInferSession,
|
||||
TableLabelDecode,
|
||||
@@ -29,6 +30,9 @@ class TableStructurer:
|
||||
self.preprocess_op = TablePreprocess()
|
||||
self.batch_preprocess_op = BatchTablePreprocess()
|
||||
|
||||
config["intra_op_num_threads"] = get_op_num_threads("MINERU_INTRA_OP_NUM_THREADS")
|
||||
config["inter_op_num_threads"] = get_op_num_threads("MINERU_INTER_OP_NUM_THREADS")
|
||||
|
||||
self.session = OrtInferSession(config)
|
||||
|
||||
self.character = self.session.get_metadata()
|
||||
|
||||
@@ -5,6 +5,8 @@ from typing import Optional, Dict, Any, Tuple
|
||||
import cv2
|
||||
import numpy as np
|
||||
from skimage import measure
|
||||
|
||||
from mineru.utils.os_env_config import get_op_num_threads
|
||||
from .utils import OrtInferSession, resize_img
|
||||
from .utils_table_line_rec import (
|
||||
get_table_line,
|
||||
@@ -28,6 +30,9 @@ class TSRUnet:
|
||||
self.inp_height = 1024
|
||||
self.inp_width = 1024
|
||||
|
||||
config["intra_op_num_threads"] = get_op_num_threads("MINERU_INTRA_OP_NUM_THREADS")
|
||||
config["inter_op_num_threads"] = get_op_num_threads("MINERU_INTER_OP_NUM_THREADS")
|
||||
|
||||
self.session = OrtInferSession(config)
|
||||
|
||||
def __call__(
|
||||
|
||||
@@ -4,6 +4,10 @@ import platform
|
||||
from packaging import version
|
||||
|
||||
|
||||
def is_windows_environment() -> bool:
    """Return True when ``platform.system()`` reports ``"Windows"``."""
    system_name = platform.system()
    return system_name == "Windows"
|
||||
|
||||
|
||||
def is_mac_environment() -> bool:
    """Return True when ``platform.system()`` reports ``"Darwin"`` (macOS)."""
    system_name = platform.system()
    return system_name == "Darwin"
|
||||
30
mineru/utils/os_env_config.py
Normal file
30
mineru/utils/os_env_config.py
Normal file
@@ -0,0 +1,30 @@
|
||||
import os
|
||||
|
||||
|
||||
def get_op_num_threads(env_name: str) -> int:
    """Read a thread-count setting from the environment variable ``env_name``.

    Args:
        env_name: Name of the environment variable to read, e.g.
            ``MINERU_INTRA_OP_NUM_THREADS``.

    Returns:
        The variable's value when it parses as a positive integer, otherwise
        ``-1`` (also returned when the variable is unset).
    """
    return get_value_from_string(os.getenv(env_name), -1)
|
||||
|
||||
|
||||
def get_load_images_timeout() -> int:
    """Return the PDF-render timeout read from ``MINERU_PDF_RENDER_TIMEOUT``.

    Returns:
        The variable's value when it parses as a positive integer, otherwise
        ``300`` (also returned when the variable is unset).
    """
    return get_value_from_string(os.getenv('MINERU_PDF_RENDER_TIMEOUT'), 300)
|
||||
|
||||
|
||||
def get_value_from_string(env_value: "str | None", default_value: int) -> int:
    """Parse a positive integer from an environment-variable string.

    Args:
        env_value: Raw value read from the environment, or ``None`` when the
            variable is unset.
        default_value: Value returned whenever ``env_value`` is unusable.

    Returns:
        ``int(env_value)`` when it parses and is strictly greater than zero;
        ``default_value`` when ``env_value`` is ``None``, is not an integer
        literal, or is zero or negative.
    """
    if env_value is None:
        return default_value
    try:
        parsed = int(env_value)
    except ValueError:
        # Not an integer literal (e.g. "abc" or "2.5"): fall back silently.
        return default_value
    # Zero and negative values are treated as "not configured".
    return parsed if parsed > 0 else default_value
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke check: a valid value, zero, a negative value and a
    # non-numeric value, followed by the effective render timeout.
    for sample in ('1', '0', '-1', 'abc'):
        print(get_value_from_string(sample, -1))
    print(get_load_images_timeout())
|
||||
@@ -1,4 +1,5 @@
|
||||
# Copyright (c) Opendatalab. All rights reserved.
|
||||
import os
|
||||
from io import BytesIO
|
||||
|
||||
import numpy as np
|
||||
@@ -7,9 +8,14 @@ from loguru import logger
|
||||
from PIL import Image
|
||||
|
||||
from mineru.data.data_reader_writer import FileBasedDataWriter
|
||||
from mineru.utils.check_sys_env import is_windows_environment
|
||||
from mineru.utils.os_env_config import get_load_images_timeout
|
||||
from mineru.utils.pdf_reader import image_to_b64str, image_to_bytes, page_to_image
|
||||
from .enum_class import ImageType
|
||||
from .hash_utils import str_sha256
|
||||
from mineru.utils.enum_class import ImageType
|
||||
from mineru.utils.hash_utils import str_sha256
|
||||
from mineru.utils.pdf_page_id import get_end_page_id
|
||||
|
||||
from concurrent.futures import ProcessPoolExecutor, TimeoutError as FuturesTimeoutError
|
||||
|
||||
|
||||
def pdf_page_to_image(page: pdfium.PdfPage, dpi=200, image_type=ImageType.PIL) -> dict:
|
||||
@@ -35,7 +41,106 @@ def pdf_page_to_image(page: pdfium.PdfPage, dpi=200, image_type=ImageType.PIL) -
|
||||
return image_dict
|
||||
|
||||
|
||||
def _load_images_from_pdf_worker(pdf_bytes, dpi, start_page_id, end_page_id, image_type):
    """Wrapper submitted to the process pool; delegates to ``load_images_from_pdf_core``."""
    rendered_images = load_images_from_pdf_core(pdf_bytes, dpi, start_page_id, end_page_id, image_type)
    return rendered_images
|
||||
|
||||
|
||||
def _split_page_ranges(start_page_id, end_page_id, workers):
    """Split the inclusive page range into ``workers`` contiguous, balanced chunks."""
    total_pages = end_page_id - start_page_id + 1
    base, extra = divmod(total_pages, workers)
    page_ranges = []
    range_start = start_page_id
    for worker_idx in range(workers):
        # The first ``extra`` workers take one additional page so that no
        # single worker ends up with the whole remainder.
        size = base + (1 if worker_idx < extra else 0)
        page_ranges.append((range_start, range_start + size - 1))
        range_start += size
    return page_ranges


def _terminate_pool_workers(executor):
    """Best-effort termination of the pool's worker processes."""
    # NOTE(review): ``_processes`` is a private CPython attribute of
    # ProcessPoolExecutor; the public API offers no way to stop a task that is
    # already running, so a hung render can only be stopped by killing workers.
    processes = getattr(executor, "_processes", None) or {}
    for process in list(processes.values()):
        try:
            if process.is_alive():
                process.terminate()
        except Exception as exc:
            logger.warning(f"Failed to terminate PDF render worker: {exc}")


def load_images_from_pdf(
    pdf_bytes: bytes,
    dpi=200,
    start_page_id=0,
    end_page_id=None,
    image_type=ImageType.PIL,
    timeout=None,
    threads=4,
):
    """Render PDF pages to images, using a process pool with an overall timeout.

    On Windows the pages are rendered in the current process and no timeout is
    applied. Elsewhere the page range is split across worker processes.

    Args:
        pdf_bytes (bytes): Content of the PDF file.
        dpi (int, optional): Rendering resolution. Defaults to 200.
        start_page_id (int, optional): First page index to render. Defaults to 0.
        end_page_id (int | None, optional): Last page index to render
            (inclusive). ``None`` or a negative value means the last page.
        image_type (ImageType, optional): Output image type. Defaults to
            ImageType.PIL.
        timeout (int | None, optional): Overall timeout in seconds for the
            multi-process path. When ``None`` it is read from the environment
            variable ``MINERU_PDF_RENDER_TIMEOUT`` (300 seconds if unset).
        threads (int): Maximum number of worker processes. Defaults to 4.

    Returns:
        tuple: ``(images_list, pdf_doc)`` where ``images_list`` holds one entry
        per rendered page in page order and ``pdf_doc`` is the open
        ``pdfium.PdfDocument``; the caller is responsible for closing it.

    Raises:
        TimeoutError: When rendering does not finish within ``timeout`` seconds.
    """
    import time  # local import: only needed to compute the shared deadline

    pdf_doc = pdfium.PdfDocument(pdf_bytes)
    try:
        end_page_id = get_end_page_id(end_page_id, len(pdf_doc))

        if is_windows_environment():
            # Multiprocessing is not used on Windows.
            images_list = load_images_from_pdf_core(
                pdf_bytes,
                dpi,
                start_page_id,
                end_page_id,
                image_type
            )
            return images_list, pdf_doc

        total_pages = end_page_id - start_page_id + 1
        if total_pages <= 0:
            # Empty range: nothing to render, and a pool cannot be sized to 0.
            return [], pdf_doc

        if timeout is None:
            timeout = get_load_images_timeout()

        # Never start more processes than pages, and always at least one.
        actual_threads = max(1, min(os.cpu_count() or 1, threads, total_pages))
        page_ranges = _split_page_ranges(start_page_id, end_page_id, actual_threads)

        # The executor is managed by hand instead of ``with``: leaving a
        # ``with`` block waits for running workers, which would block on the
        # very render that triggered the timeout.
        executor = ProcessPoolExecutor(max_workers=actual_threads)
        try:
            futures = [
                executor.submit(
                    _load_images_from_pdf_worker,
                    pdf_bytes,
                    dpi,
                    range_start,
                    range_end,
                    image_type
                )
                for range_start, range_end in page_ranges
            ]

            # One deadline shared by all futures so that the total wait is
            # bounded by ``timeout`` rather than ``timeout`` per worker.
            deadline = time.monotonic() + timeout
            images_list = []
            # Futures were submitted in page order, so extending in submission
            # order keeps the pages sorted.
            for future in futures:
                remaining = max(0.0, deadline - time.monotonic())
                images_list.extend(future.result(timeout=remaining))
        except FuturesTimeoutError as exc:
            _terminate_pool_workers(executor)
            raise TimeoutError(f"PDF to images conversion timeout after {timeout}s") from exc
        except Exception:
            # A worker failed: do not leave its siblings rendering in the background.
            _terminate_pool_workers(executor)
            raise
        finally:
            executor.shutdown(wait=False, cancel_futures=True)

        return images_list, pdf_doc
    except BaseException:
        # The document is handed to the caller only on success.
        pdf_doc.close()
        raise
|
||||
|
||||
|
||||
def load_images_from_pdf_core(
|
||||
pdf_bytes: bytes,
|
||||
dpi=200,
|
||||
start_page_id=0,
|
||||
@@ -45,18 +150,17 @@ def load_images_from_pdf(
|
||||
images_list = []
|
||||
pdf_doc = pdfium.PdfDocument(pdf_bytes)
|
||||
pdf_page_num = len(pdf_doc)
|
||||
end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else pdf_page_num - 1
|
||||
if end_page_id > pdf_page_num - 1:
|
||||
logger.warning("end_page_id is out of range, use images length")
|
||||
end_page_id = pdf_page_num - 1
|
||||
end_page_id = get_end_page_id(end_page_id, pdf_page_num)
|
||||
|
||||
for index in range(0, pdf_page_num):
|
||||
if start_page_id <= index <= end_page_id:
|
||||
page = pdf_doc[index]
|
||||
image_dict = pdf_page_to_image(page, dpi=dpi, image_type=image_type)
|
||||
images_list.append(image_dict)
|
||||
for index in range(start_page_id, end_page_id + 1):
|
||||
# logger.debug(f"Converting page {index}/{pdf_page_num} to image")
|
||||
page = pdf_doc[index]
|
||||
image_dict = pdf_page_to_image(page, dpi=dpi, image_type=image_type)
|
||||
images_list.append(image_dict)
|
||||
|
||||
return images_list, pdf_doc
|
||||
pdf_doc.close()
|
||||
|
||||
return images_list
|
||||
|
||||
|
||||
def cut_image(bbox: tuple, page_num: int, page_pil_img, return_path, image_writer: FileBasedDataWriter, scale=2):
|
||||
|
||||
10
mineru/utils/pdf_page_id.py
Normal file
10
mineru/utils/pdf_page_id.py
Normal file
@@ -0,0 +1,10 @@
|
||||
# Copyright (c) Opendatalab. All rights reserved.
|
||||
from loguru import logger
|
||||
|
||||
|
||||
def get_end_page_id(end_page_id, pdf_page_num):
    """Normalize an inclusive end-page index against the document length.

    Args:
        end_page_id: Requested last page index; ``None`` or a negative value
            selects the last page.
        pdf_page_num: Total number of pages in the document.

    Returns:
        ``end_page_id`` when it lies within the document, otherwise
        ``pdf_page_num - 1``. A warning is logged when the requested index is
        past the last page.
    """
    last_page_id = pdf_page_num - 1
    if end_page_id is None or end_page_id < 0:
        return last_page_id
    if end_page_id > last_page_id:
        logger.warning("end_page_id is out of range, use images length")
        return last_page_id
    return end_page_id
|
||||
@@ -1 +1 @@
|
||||
__version__ = "2.6.2"
|
||||
__version__ = "2.6.3"
|
||||
|
||||
Reference in New Issue
Block a user