Merge pull request #4406 from opendatalab/release-2.7.2

Release 2.7.2
This commit is contained in:
Xiaomeng Zhao
2026-01-23 21:32:43 +08:00
committed by GitHub
23 changed files with 631 additions and 47 deletions

View File

@@ -45,6 +45,17 @@
# Changelog
- 2026/01/23 2.7.2 Release
- Added support for the domestic computing platforms Hygon, Enflame, and MooreThreads. The officially supported domestic computing platforms now include:
- [Ascend](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Ascend/)
- [T-Head](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/THead/)
- [METAX](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/METAX/)
- [Hygon](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Hygon/)
- [Enflame](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Enflame/)
- [MooreThreads](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/MooreThreads/)
- MinerU continues to ensure compatibility with domestic hardware platforms, supporting mainstream chip architectures and empowering research, government, and enterprise users to digitize documents on secure, reliable technology.
- Optimized cross-page table merging, improving both merge success rate and merge quality
- 2026/01/06 2.7.1 Release
- Fixed bug: #4300
- Updated pdfminer.six dependency version to resolve [CVE-2025-64512](https://github.com/advisories/GHSA-wf5f-4jwr-ppcp)

View File

@@ -45,6 +45,17 @@
# Changelog
- 2026/01/23 2.7.2 Release
- Added support for the domestic computing platforms Hygon, Enflame, and MooreThreads. The officially adapted and supported domestic computing platforms now include:
- [昇腾 Ascend](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Ascend)
- [平头哥 T-Head](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/THead)
- [沐曦 METAX](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/METAX)
- [海光 DCU](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Hygon/)
- [燧原 GCU](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Enflame/)
- [摩尔线程 MUSA](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/MooreThreads/)
- MinerU continues to ensure compatibility with domestic hardware platforms, supporting mainstream chip architectures and empowering research, government, and enterprise users to digitize documents on secure, reliable technology.
- Optimized cross-page table merging, improving both merge success rate and merge quality
- 2026/01/06 2.7.1 Release
- Fixed bug: #4300
- Updated the pdfminer.six dependency version to resolve [CVE-2025-64512](https://github.com/advisories/GHSA-wf5f-4jwr-ppcp)

View File

@@ -0,0 +1,30 @@
# Base image containing the vLLM inference environment, requiring amd64(x86-64) CPU + Enflame GCU.
FROM crpi-vofi3w62lkohhxsp.cn-shanghai.personal.cr.aliyuncs.com/opendatalab-mineru/gcu:docker_images_topsrider_i3x_3.6.20260106_vllm0.11_pytorch2.8.0
# Install Noto fonts for Chinese characters
RUN echo 'deb http://mirrors.aliyun.com/ubuntu/ noble main restricted universe multiverse\n\
deb http://mirrors.aliyun.com/ubuntu/ noble-updates main restricted universe multiverse\n\
deb http://mirrors.aliyun.com/ubuntu/ noble-backports main restricted universe multiverse\n\
deb http://mirrors.aliyun.com/ubuntu/ noble-security main restricted universe multiverse' > /tmp/aliyun-sources.list && \
apt-get -o Dir::Etc::SourceList=/tmp/aliyun-sources.list update && \
apt-get -o Dir::Etc::SourceList=/tmp/aliyun-sources.list install -y \
fonts-noto-core \
fonts-noto-cjk \
fontconfig && \
fc-cache -fv && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/aliyun-sources.list
# Install mineru latest
RUN python3 -m pip install "mineru[core]>=2.7.2" \
numpy==1.26.4 \
opencv-python==4.11.0.86 \
-i https://mirrors.aliyun.com/pypi/simple && \
python3 -m pip cache purge
# Download models and update the configuration file
RUN /bin/bash -c "mineru-models-download -s modelscope -m all"
# Set the entry point to activate the virtual environment and run the command line tool
ENTRYPOINT ["/bin/bash", "-c", "export MINERU_MODEL_SOURCE=local && exec \"$@\"", "--"]

View File

@@ -0,0 +1,38 @@
# Base image containing the vLLM inference environment, requiring amd64(x86-64) CPU + MooreThreads GPU.
FROM registry.mthreads.com/mcconline/vllm-musa-qy2-py310:v0.8.4-release
# Install libgl for opencv support & Noto fonts for Chinese characters
RUN apt-get update && \
apt-get install -y \
fonts-noto-core \
fonts-noto-cjk \
fontconfig \
libgl1 && \
fc-cache -fv && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Install mineru latest
RUN python3 -m pip install -U pip -i https://mirrors.aliyun.com/pypi/simple && \
git clone https://gitcode.com/gh_mirrors/vi/vision.git -b v0.20.0 --depth 1 && \
cd vision && \
python3 setup.py install && \
python3 -m pip install "mineru[api,gradio]>=2.7.2" \
"matplotlib>=3.10,<4" \
"ultralytics>=8.3.48,<9" \
"doclayout_yolo==0.0.4" \
"ftfy>=6.3.1,<7" \
"shapely>=2.0.7,<3" \
"pyclipper>=1.3.0,<2" \
"omegaconf>=2.3.0,<3" \
numpy==1.26.4 \
opencv-python==4.11.0.86 \
-i https://mirrors.aliyun.com/pypi/simple && \
python3 -m pip cache purge
# Download models and update the configuration file
RUN /bin/bash -c "mineru-models-download -s modelscope -m all"
# Set the entry point to activate the virtual environment and run the command line tool
ENTRYPOINT ["/bin/bash", "-c", "export MINERU_MODEL_SOURCE=local && exec \"$@\"", "--"]

View File

@@ -1,2 +1 @@
<script type="module" src="https://gradio.s3-us-west-2.amazonaws.com/5.35.0/gradio.js"></script>
<gradio-app src="https://opendatalab-mineru.hf.space"></gradio-app>
<iframe src="https://opendatalab-mineru.hf.space" style="min-height: calc(-40px + 100vh); width: 100%; flex-grow: 1; border: medium; overflow: auto; height: 1746px;"></iframe>

View File

@@ -29,12 +29,12 @@ The following sections provide detailed descriptions of each file's purpose and
![layout page example](../images/layout_example.png)
### Text Spans File (spans.pdf)
### Text Spans File (span.pdf)
> [!NOTE]
> Only applicable to pipeline backend
**File naming format**: `{original_filename}_spans.pdf`
**File naming format**: `{original_filename}_span.pdf`
**Functionality**:
@@ -702,7 +702,7 @@ The above files constitute MinerU's complete output results. Users can choose ap
- **Debugging and verification** (Use visualization files):
* layout.pdf
* spans.pdf
* span.pdf
- **Content extraction**: (Use simplified files):
* *.md

View File

@@ -125,3 +125,9 @@ Here are the environment variables and their descriptions:
- `MINERU_HYBRID_FORCE_PIPELINE_ENABLE`:
* Forces the text-extraction part of `hybrid-*` backends to be processed with small models.
* Defaults to `false`. Can be set to `true` via environment variable to enable this feature, thereby reducing hallucinations in certain extreme cases.
- `MINERU_VL_MODEL_NAME`:
* Used to specify the model name for the vlm/hybrid backend, allowing you to designate the model required for MinerU to run when multiple models exist on a remote openai-server.
- `MINERU_VL_API_KEY`:
* Used to specify the API Key for the vlm/hybrid backend, enabling authentication on the remote openai-server.
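Both variables are read from the environment; a hypothetical shell setup (the model name and key below are placeholders for whatever your remote openai-server actually serves):

```shell
# Hypothetical values - substitute the model name and key of your remote openai-server.
export MINERU_VL_MODEL_NAME="my-served-model"
export MINERU_VL_API_KEY="my-secret-key"
```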

View File

@@ -29,12 +29,12 @@
![layout page example](../images/layout_example.png)
### Text Spans File (spans.pdf)
### Text Spans File (span.pdf)
> [!NOTE]
> Only applicable to the pipeline backend
**File naming format**: `{original_filename}_spans.pdf`
**File naming format**: `{original_filename}_span.pdf`
**Functionality**:
@@ -817,7 +817,7 @@ The structure of the vlm backend's content_list.json file is similar to the pipeline backend's, along with
- **Debugging and verification** (use visualization files):
* layout.pdf
* spans.pdf
* span.pdf
- **Content extraction** (use simplified files):
* *.md

View File

@@ -0,0 +1,109 @@
## 1. Test Platform
The following platform information, used for testing this guide, is provided for reference:
```
os: Ubuntu 22.04.4 LTS
cpu: Intel x86-64
gcu: Enflame S60
driver: 1.7.0.9
docker: 28.0.1
```
## 2. Environment Setup
### 2.1 Build the Image with the Dockerfile
```bash
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/china/gcu.Dockerfile
docker build --network=host -t mineru:gcu-vllm-latest -f gcu.Dockerfile .
```
## 3. Start the Docker Container
```bash
docker run -u root --name mineru_docker \
--network=host \
--ipc=host \
--privileged \
-e MINERU_MODEL_SOURCE=local \
-it mineru:gcu-vllm-latest \
/bin/bash
```
After running this command, you will enter the Docker container's interactive terminal, where you can run MinerU commands directly to use its features.
You can also start a MinerU service directly by replacing `/bin/bash` with a service startup command; see [Start services via commands](https://opendatalab.github.io/MinerU/zh/usage/quick_usage/#apiwebuihttp-clientserver) for details.
## 4. Notes
MinerU's support for Enflame accelerator cards in different environments is shown in the table below:
<table border="1">
<thead>
<tr>
<th rowspan="2" colspan="2">Use case</th>
<th colspan="2">Container environment</th>
</tr>
<tr>
<th>vllm</th>
</tr>
</thead>
<tbody>
<tr>
<td rowspan="3">Command-line tool (mineru)</td>
<td>pipeline</td>
<td>🟢</td>
</tr>
<tr>
<td>&lt;vlm/hybrid&gt;-auto-engine</td>
<td>🟢</td>
</tr>
<tr>
<td>&lt;vlm/hybrid&gt;-http-client</td>
<td>🟢</td>
</tr>
<tr>
<td rowspan="3">FastAPI service (mineru-api)</td>
<td>pipeline</td>
<td>🟢</td>
</tr>
<tr>
<td>&lt;vlm/hybrid&gt;-auto-engine</td>
<td>🟢</td>
</tr>
<tr>
<td>&lt;vlm/hybrid&gt;-http-client</td>
<td>🟢</td>
</tr>
<tr>
<td rowspan="3">Gradio UI (mineru-gradio)</td>
<td>pipeline</td>
<td>🟢</td>
</tr>
<tr>
<td>&lt;vlm/hybrid&gt;-auto-engine</td>
<td>🟢</td>
</tr>
<tr>
<td>&lt;vlm/hybrid&gt;-http-client</td>
<td>🟢</td>
</tr>
<tr>
<td colspan="2">openai-server service (mineru-openai-server)</td>
<td>🟢</td>
</tr>
<tr>
<td colspan="2">Data parallelism (--data-parallel-size)</td>
<td>🔴</td>
</tr>
</tbody>
</table>
Notes:
🟢: Supported; runs stably, with accuracy essentially on par with NVIDIA GPUs
🟡: Supported but less stable; may fail in some scenarios, or accuracy may differ somewhat
🔴: Unsupported; does not run, or accuracy differs significantly
>[!TIP]
>Selecting specific GCU accelerator cards works much like it does for NVIDIA GPUs; see the [Using specific GPU devices](https://opendatalab.github.io/MinerU/zh/usage/advanced_cli_parameters/#cuda_visible_devices) section,
>and replace the environment variable `CUDA_VISIBLE_DEVICES` with `TOPS_VISIBLE_DEVICES`.
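Following that tip, restricting MinerU to specific GCU devices can be sketched as below (device indices and the commented invocation paths are illustrative):

```shell
# Restrict MinerU to GCU devices 0 and 1 (analogous to CUDA_VISIBLE_DEVICES)
export TOPS_VISIBLE_DEVICES=0,1
# mineru -p demo.pdf -o ./output   # hypothetical input/output paths
```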

View File

@@ -0,0 +1,115 @@
## 1. Test Platform
The following platform information, used for testing this guide, is provided for reference:
```
os: Ubuntu 22.04.4 LTS
cpu: Intel x86-64
gpu: MTT S4000
driver: 3.0.0-rc-KuaE2.0
docker: 24.0.7
```
## 2. Environment Setup
### 2.1 Build the Image with the Dockerfile
```bash
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/china/musa.Dockerfile
docker build --network=host -t mineru:musa-vllm-latest -f musa.Dockerfile .
```
## 3. Start the Docker Container
```bash
docker run -u root --name mineru_docker \
--network=host \
--ipc=host \
--shm-size=80g \
--privileged \
-e MTHREADS_VISIBLE_DEVICES=all \
-e MINERU_MODEL_SOURCE=local \
-it mineru:musa-vllm-latest \
/bin/bash
```
After running this command, you will enter the Docker container's interactive terminal, where you can run MinerU commands directly to use its features.
You can also start a MinerU service directly by replacing `/bin/bash` with a service startup command; see [Start services via commands](https://opendatalab.github.io/MinerU/zh/usage/quick_usage/#apiwebuihttp-clientserver) for details.
## 4. Notes
MinerU's support for MooreThreads accelerator cards in different environments is shown in the table below:
>[!NOTE]
> **Compatibility note**: Because MooreThreads' support for the vLLM v1 engine is still maturing, MinerU currently uses the v0 engine for this adaptation.
> Due to this limitation, vLLM's Async Engine has compatibility issues, so some usage scenarios may not work properly.
> We will keep tracking MooreThreads' progress on vLLM v1 engine support and adapt and optimize MinerU accordingly.
<table border="1">
<thead>
<tr>
<th rowspan="2" colspan="2">Use case</th>
<th colspan="2">Container environment</th>
</tr>
<tr>
<th>vllm</th>
</tr>
</thead>
<tbody>
<tr>
<td rowspan="3">Command-line tool (mineru)</td>
<td>pipeline</td>
<td>🟢</td>
</tr>
<tr>
<td>&lt;vlm/hybrid&gt;-auto-engine</td>
<td>🟢</td>
</tr>
<tr>
<td>&lt;vlm/hybrid&gt;-http-client</td>
<td>🟢</td>
</tr>
<tr>
<td rowspan="3">FastAPI service (mineru-api)</td>
<td>pipeline</td>
<td>🟢</td>
</tr>
<tr>
<td>&lt;vlm/hybrid&gt;-auto-engine</td>
<td>🔴</td>
</tr>
<tr>
<td>&lt;vlm/hybrid&gt;-http-client</td>
<td>🟢</td>
</tr>
<tr>
<td rowspan="3">Gradio UI (mineru-gradio)</td>
<td>pipeline</td>
<td>🟢</td>
</tr>
<tr>
<td>&lt;vlm/hybrid&gt;-auto-engine</td>
<td>🔴</td>
</tr>
<tr>
<td>&lt;vlm/hybrid&gt;-http-client</td>
<td>🟢</td>
</tr>
<tr>
<td colspan="2">openai-server service (mineru-openai-server)</td>
<td>🟢</td>
</tr>
<tr>
<td colspan="2">Data parallelism (--data-parallel-size)</td>
<td>🔴</td>
</tr>
</tbody>
</table>
Notes:
🟢: Supported; runs stably, with accuracy essentially on par with NVIDIA GPUs
🟡: Supported but less stable; may fail in some scenarios, or accuracy may differ somewhat
🔴: Unsupported; does not run, or accuracy differs significantly
>[!TIP]
>Selecting specific MooreThreads accelerator cards works much like it does for NVIDIA GPUs; see [GPU enumeration](https://docs.mthreads.com/cloud-native/cloud-native-doc-online/install_guide/#gpu-%E6%9E%9A%E4%B8%BE)

View File

@@ -119,4 +119,10 @@ Some parameters of the MinerU command-line tool have environment-variable equivalents
- `MINERU_HYBRID_FORCE_PIPELINE_ENABLE`
* Forces the text-extraction part of `hybrid-*` backends to be processed with small models
* Defaults to `false`; it can be set to `true` via the environment variable to enable this feature, reducing hallucinations in certain extreme cases.
- `MINERU_VL_MODEL_NAME`
* Specifies the model name for the vlm/hybrid backend, allowing you to designate the model MinerU should use when multiple models exist on a remote openai-server.
- `MINERU_VL_API_KEY`:
* Specifies the API Key for the vlm/hybrid backend, enabling authentication against a remote openai-server.

View File

@@ -13,6 +13,8 @@
* [平头哥 T-Head](acceleration_cards/THead.md) 🚀
* [沐曦 METAX](acceleration_cards/METAX.md) 🚀
* [海光 Hygon](acceleration_cards/Hygon.md) 🚀
* [燧原 Enflame](acceleration_cards/Enflame.md) 🚀
* [摩尔线程 MooreThreads](acceleration_cards/MooreThreads.md) 🚀
* [AMD](acceleration_cards/AMD.md) [#3662](https://github.com/opendatalab/MinerU/discussions/3662) ❤️
* [太初元碁 Tecorigin](acceleration_cards/Tecorigin.md) [#3767](https://github.com/opendatalab/MinerU/pull/3767) ❤️
* [寒武纪 Cambricon](acceleration_cards/Cambricon.md) [#4004](https://github.com/opendatalab/MinerU/discussions/4004) ❤️

View File

@@ -18,6 +18,10 @@ def enable_custom_logits_processors() -> bool:
compute_capability = f"{major}.{minor}"
elif hasattr(torch, 'npu') and torch.npu.is_available():
compute_capability = "8.0"
elif hasattr(torch, 'gcu') and torch.gcu.is_available():
compute_capability = "8.0"
elif hasattr(torch, 'musa') and torch.musa.is_available():
compute_capability = "8.0"
else:
logger.info("CUDA not available, disabling custom_logits_processors")
return False

View File

@@ -1,6 +1,7 @@
# Copyright (c) Opendatalab. All rights reserved.
import os
import time
import json
from loguru import logger
@@ -99,6 +100,30 @@ class ModelSingleton:
import vllm
except ImportError:
raise ImportError("Please install vllm to use the vllm-engine backend.")
"""
# Special configuration for the musa vLLM v1 engine
device = get_device()
if device.startswith("musa"):
import torch
if torch.musa.is_available():
compilation_config = {
"cudagraph_capture_sizes": [1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 20, 24, 28, 30],
"simple_cuda_graph": True
}
block_size = 32
kwargs["compilation_config"] = compilation_config
kwargs["block_size"] = block_size
"""
if "compilation_config" in kwargs:
if isinstance(kwargs["compilation_config"], str):
try:
kwargs["compilation_config"] = json.loads(kwargs["compilation_config"])
except json.JSONDecodeError:
logger.warning(
f"Failed to parse compilation_config as JSON: {kwargs['compilation_config']}")
del kwargs["compilation_config"]
if "gpu_memory_utilization" not in kwargs:
kwargs["gpu_memory_utilization"] = set_default_gpu_memory_utilization()
if "model" not in kwargs:
@@ -112,8 +137,38 @@ class ModelSingleton:
try:
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.v1.engine.async_llm import AsyncLLM
from vllm.config import CompilationConfig
except ImportError:
raise ImportError("Please install vllm to use the vllm-async-engine backend.")
"""
# Special configuration for the musa vLLM v1 engine
device = get_device()
if device.startswith("musa"):
import torch
if torch.musa.is_available():
compilation_config = CompilationConfig(
cudagraph_capture_sizes=[1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 20, 24, 28, 30],
simple_cuda_graph=True
)
block_size = 32
kwargs["compilation_config"] = compilation_config
kwargs["block_size"] = block_size
"""
if "compilation_config" in kwargs:
if isinstance(kwargs["compilation_config"], dict):
# If it's a dict, convert it to a CompilationConfig object
kwargs["compilation_config"] = CompilationConfig(**kwargs["compilation_config"])
elif isinstance(kwargs["compilation_config"], str):
# If it's a JSON string, parse it first and then convert
try:
config_dict = json.loads(kwargs["compilation_config"])
kwargs["compilation_config"] = CompilationConfig(**config_dict)
except (json.JSONDecodeError, TypeError) as e:
logger.warning(
f"Failed to parse compilation_config: {kwargs['compilation_config']}, error: {e}")
del kwargs["compilation_config"]
if "gpu_memory_utilization" not in kwargs:
kwargs["gpu_memory_utilization"] = set_default_gpu_memory_utilization()
if "model" not in kwargs:
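The dict/JSON-string handling above follows a small normalization pattern; a standalone sketch of it (the function name is illustrative, and the conversion to vLLM's CompilationConfig object is omitted):

```python
import json

def normalize_compilation_config(kwargs: dict) -> dict:
    # Accept compilation_config as a dict or a JSON string;
    # drop the key entirely if the string cannot be parsed.
    cfg = kwargs.get("compilation_config")
    if isinstance(cfg, str):
        try:
            kwargs["compilation_config"] = json.loads(cfg)
        except json.JSONDecodeError:
            del kwargs["compilation_config"]
    return kwargs
```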

View File

@@ -1,5 +1,7 @@
import os
from typing import List, Union
import torch
from tqdm import tqdm
from ultralytics import YOLO
import numpy as np
@@ -18,8 +20,8 @@ class YOLOv8MFDModel:
conf: float = 0.25,
iou: float = 0.45,
):
self.model = YOLO(weight).to(device)
self.device = device
self.device = torch.device(device)
self.model = YOLO(weight).to(self.device)
self.imgsz = imgsz
self.conf = conf
self.iou = iou

View File

@@ -23,12 +23,12 @@ class MathDataset(Dataset):
class UnimernetModel(object):
def __init__(self, weight_dir, _device_="cpu"):
from .unimernet_hf import UnimernetModel
if _device_.startswith("mps") or _device_.startswith("npu"):
if _device_.startswith("mps") or _device_.startswith("npu") or _device_.startswith("musa"):
self.model = UnimernetModel.from_pretrained(weight_dir, attn_implementation="eager")
else:
self.model = UnimernetModel.from_pretrained(weight_dir)
self.device = _device_
self.model.to(_device_)
self.device = torch.device(_device_)
self.model.to(self.device)
if not _device_.startswith("cpu"):
self.model = self.model.to(dtype=torch.float16)
self.model.eval()

View File

@@ -4,6 +4,8 @@ import cv2
import numpy as np
from scipy.spatial import distance as dist
from skimage import measure
from skimage import __version__ as skimage_version
from packaging import version
def transform_preds(coords, center, scale, output_size, rot=0):
@@ -295,7 +297,11 @@ def min_area_rect_box(
"""
boxes = []
for region in regions:
if region.bbox_area > H * W * 3 / 4: # filter out overly large cells
if version.parse(skimage_version) >= version.parse("0.26.0"):
region_bbox_area = region.area_bbox
else:
region_bbox_area = region.bbox_area
if region_bbox_area > H * W * 3 / 4: # filter out overly large cells
continue
rect = cv2.minAreaRect(region.coords[:, ::-1])

View File

@@ -2,6 +2,7 @@ import os
import sys
from mineru.backend.vlm.utils import set_default_gpu_memory_utilization, enable_custom_logits_processors
from mineru.utils.config_reader import get_device
from mineru.utils.models_download_utils import auto_download_and_get_model_root_path
from vllm.entrypoints.cli.main import main as vllm_main
@@ -13,6 +14,8 @@ def main():
has_port_arg = False
has_gpu_memory_utilization_arg = False
has_logits_processors_arg = False
has_block_size_arg = False
has_compilation_config = False
model_path = None
model_arg_indices = []
@@ -24,6 +27,10 @@ def main():
has_gpu_memory_utilization_arg = True
if arg == "--logits-processors" or arg.startswith("--logits-processors="):
has_logits_processors_arg = True
if arg == "--block-size" or arg.startswith("--block-size="):
has_block_size_arg = True
if arg == "--compilation-config" or arg.startswith("--compilation-config="):
has_compilation_config = True
if arg == "--model":
if i + 1 < len(args):
model_path = args[i + 1]
@@ -49,6 +56,17 @@ def main():
model_path = auto_download_and_get_model_root_path("/", "vlm")
if (not has_logits_processors_arg) and custom_logits_processors:
args.extend(["--logits-processors", "mineru_vl_utils:MinerULogitsProcessor"])
"""
# Special configuration for the musa vLLM v1 engine
device = get_device()
if device.startswith("musa"):
import torch
if torch.musa.is_available():
if not has_block_size_arg:
args.extend(["--block-size", "32"])
if not has_compilation_config:
args.extend(["--compilation-config", '{"cudagraph_capture_sizes": [1,2,3,4,5,6,7,8,10,12,14,16,18,20,24,28,30], "simple_cuda_graph": true}'])
"""
# Rebuild the arguments, passing the model path as a positional argument
sys.argv = [sys.argv[0]] + ["serve", model_path] + args
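The argument scan above checks both the `--flag value` and `--flag=value` spellings for each option; a minimal helper capturing that pattern (the name is illustrative):

```python
def has_flag(args, flag):
    # True if the flag appears either as "--flag value" or "--flag=value".
    return any(a == flag or a.startswith(flag + "=") for a in args)
```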

View File

@@ -186,6 +186,18 @@ def model_init(model_name: str):
bf_16_support = True
elif device_name.startswith("mps"):
bf_16_support = True
elif device_name.startswith("gcu"):
if hasattr(torch, 'gcu') and torch.gcu.is_available():
if torch.gcu.is_bf16_supported():
bf_16_support = True
elif device_name.startswith("musa"):
if hasattr(torch, 'musa') and torch.musa.is_available():
if torch.musa.is_bf16_supported():
bf_16_support = True
elif device_name.startswith("npu"):
if hasattr(torch, 'npu') and torch.npu.is_available():
if torch.npu.is_bf16_supported():
bf_16_support = True
if model_name == 'layoutreader':
# Check whether the modelscope cache directory exists

View File

@@ -86,7 +86,15 @@ def get_device():
if torch_npu.npu.is_available():
return "npu"
except Exception as e:
pass
try:
if torch.gcu.is_available():
return "gcu"
except Exception as e:
pass
try:
if torch.musa.is_available():
return "musa"
except Exception as e:
pass
return "cpu"
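This device probe can be exercised without real hardware; a minimal sketch of the same priority-ordered, exception-tolerant detection (the attribute names and their order are illustrative):

```python
from types import SimpleNamespace

def detect_backend(torch_mod):
    # Probe optional device namespaces in priority order;
    # a missing attribute or a raising probe simply falls through.
    for name in ("cuda", "npu", "gcu", "musa"):
        ns = getattr(torch_mod, name, None)
        if ns is None:
            continue
        try:
            if ns.is_available():
                return name
        except Exception:
            continue
    return "cpu"

# Fake torch module: the gcu probe raises, musa reports available.
fake_torch = SimpleNamespace(
    cuda=SimpleNamespace(is_available=lambda: False),
    gcu=SimpleNamespace(is_available=lambda: (_ for _ in ()).throw(RuntimeError())),
    musa=SimpleNamespace(is_available=lambda: True),
)
```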

View File

@@ -414,7 +414,7 @@ def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshol
def clean_memory(device='cuda'):
if device == 'cuda':
if str(device).startswith("cuda"):
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
@@ -423,6 +423,12 @@ def clean_memory(device='cuda'):
torch_npu.npu.empty_cache()
elif str(device).startswith("mps"):
torch.mps.empty_cache()
elif str(device).startswith("gcu"):
if torch.gcu.is_available():
torch.gcu.empty_cache()
elif str(device).startswith("musa"):
if torch.musa.is_available():
torch.musa.empty_cache()
gc.collect()
@@ -458,5 +464,11 @@ def get_vram(device) -> int:
elif str(device).startswith("npu"):
if torch_npu.npu.is_available():
total_memory = round(torch_npu.npu.get_device_properties(device).total_memory / (1024 ** 3)) # convert to GB
elif str(device).startswith("gcu"):
if torch.gcu.is_available():
total_memory = round(torch.gcu.get_device_properties(device).total_memory / (1024 ** 3)) # convert to GB
elif str(device).startswith("musa"):
if torch.musa.is_available():
total_memory = round(torch.musa.get_device_properties(device).total_memory / (1024 ** 3)) # convert to GB
return total_memory

View File

@@ -17,6 +17,7 @@ CONTINUATION_END_MARKERS = [
"(cont.)",
"(contd)",
"(…continued)",
"续表",
]
CONTINUATION_INLINE_MARKERS = [
@@ -69,6 +70,69 @@ def calculate_table_total_columns(soup):
return max_cols
def build_table_occupied_matrix(soup):
"""Build the table's occupancy matrix and return each row's effective column count
Args:
soup: the BeautifulSoup-parsed table
Returns:
dict: {row_idx: effective_columns}, each row's effective column count (accounting for rowspan occupancy)
"""
rows = soup.find_all("tr")
if not rows:
return {}
occupied = {} # {row_idx: {col_idx: True}}
row_effective_cols = {} # {row_idx: effective_columns}
for row_idx, row in enumerate(rows):
col_idx = 0
cells = row.find_all(["td", "th"])
if row_idx not in occupied:
occupied[row_idx] = {}
for cell in cells:
# Find the next unoccupied column position
while col_idx in occupied[row_idx]:
col_idx += 1
colspan = int(cell.get("colspan", 1))
rowspan = int(cell.get("rowspan", 1))
# Mark every position occupied by this cell
for r in range(row_idx, row_idx + rowspan):
if r not in occupied:
occupied[r] = {}
for c in range(col_idx, col_idx + colspan):
occupied[r][c] = True
col_idx += colspan
# The row's effective column count is the max occupied column index + 1
if occupied[row_idx]:
row_effective_cols[row_idx] = max(occupied[row_idx].keys()) + 1
else:
row_effective_cols[row_idx] = 0
return row_effective_cols
def calculate_row_effective_columns(soup, row_idx):
"""Compute the effective column count of the given row (accounting for rowspan occupancy)
Args:
soup: the BeautifulSoup-parsed table
row_idx: row index
Returns:
int: the row's effective column count
"""
row_effective_cols = build_table_occupied_matrix(soup)
return row_effective_cols.get(row_idx, 0)
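The occupancy bookkeeping above can be illustrated without BeautifulSoup; a sketch of the same algorithm operating on plain (colspan, rowspan) cell specs instead of HTML cells:

```python
def effective_columns(rows):
    """rows: list of rows; each row is a list of (colspan, rowspan) cell specs.
    Returns {row_idx: effective column count}, with rowspan carryover."""
    occupied = {}  # {row_idx: {col_idx: True}}
    result = {}
    for r, cells in enumerate(rows):
        occupied.setdefault(r, {})
        col = 0
        for colspan, rowspan in cells:
            # Find the next unoccupied column position
            while col in occupied[r]:
                col += 1
            # Mark every position covered by this cell
            for rr in range(r, r + rowspan):
                occupied.setdefault(rr, {})
                for cc in range(col, col + colspan):
                    occupied[rr][cc] = True
            col += colspan
        result[r] = max(occupied[r]) + 1 if occupied[r] else 0
    return result
```

For example, a first row with a rowspan-2 cell plus a colspan-2 cell, followed by a second row of two plain cells, yields three effective columns for both rows.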
def calculate_row_columns(row):
"""
Compute the actual column count of a table row (accounting for the colspan attribute)
@@ -118,6 +182,10 @@ def detect_table_headers(soup1, soup2, max_header_rows=5):
rows1 = soup1.find_all("tr")
rows2 = soup2.find_all("tr")
# Build the effective-column-count matrices for both tables
effective_cols1 = build_table_occupied_matrix(soup1)
effective_cols2 = build_table_occupied_matrix(soup2)
min_rows = min(len(rows1), len(rows2), max_header_rows)
header_rows = 0
headers_match = True
@@ -135,20 +203,24 @@ def detect_table_headers(soup1, soup2, max_header_rows=5):
if len(cells1) != len(cells2):
structure_match = False
else:
# Then check cell attributes and contents
for cell1, cell2 in zip(cells1, cells2):
colspan1 = int(cell1.get("colspan", 1))
rowspan1 = int(cell1.get("rowspan", 1))
colspan2 = int(cell2.get("colspan", 1))
rowspan2 = int(cell2.get("rowspan", 1))
# Check that effective column counts match (accounting for rowspan)
if effective_cols1.get(i, 0) != effective_cols2.get(i, 0):
structure_match = False
else:
# Then check cell attributes and contents
for cell1, cell2 in zip(cells1, cells2):
colspan1 = int(cell1.get("colspan", 1))
rowspan1 = int(cell1.get("rowspan", 1))
colspan2 = int(cell2.get("colspan", 1))
rowspan2 = int(cell2.get("rowspan", 1))
# Strip all whitespace characters (spaces, newlines, tabs, etc.)
text1 = ''.join(full_to_half(cell1.get_text()).split())
text2 = ''.join(full_to_half(cell2.get_text()).split())
# Strip all whitespace characters (spaces, newlines, tabs, etc.)
text1 = ''.join(full_to_half(cell1.get_text()).split())
text2 = ''.join(full_to_half(cell2.get_text()).split())
if colspan1 != colspan2 or rowspan1 != rowspan2 or text1 != text2:
structure_match = False
break
if colspan1 != colspan2 or rowspan1 != rowspan2 or text1 != text2:
structure_match = False
break
if structure_match:
header_rows += 1
@@ -158,7 +230,54 @@ def detect_table_headers(soup1, soup2, max_header_rows=5):
headers_match = header_rows > 0 # headers are considered matching only if at least one row matched
break
# If no matching header rows were found, report failure
# If strict matching failed, try visual-consistency matching (compare text content only)
if header_rows == 0:
header_rows, headers_match, header_texts = _detect_table_headers_visual(soup1, soup2, rows1, rows2, max_header_rows)
return header_rows, headers_match, header_texts
def _detect_table_headers_visual(soup1, soup2, rows1, rows2, max_header_rows=5):
"""
Detect table headers by visual consistency (compare text content only, ignoring colspan/rowspan differences)
Args:
soup1: BeautifulSoup object of the first table
soup2: BeautifulSoup object of the second table
rows1: row list of the first table
rows2: row list of the second table
max_header_rows: maximum possible number of header rows
Returns:
tuple: (header row count, whether headers match, list of header texts)
"""
# Build the effective-column-count matrices for both tables
effective_cols1 = build_table_occupied_matrix(soup1)
effective_cols2 = build_table_occupied_matrix(soup2)
min_rows = min(len(rows1), len(rows2), max_header_rows)
header_rows = 0
headers_match = True
header_texts = []
for i in range(min_rows):
cells1 = rows1[i].find_all(["td", "th"])
cells2 = rows2[i].find_all(["td", "th"])
# Extract each row's list of cell texts (whitespace removed)
texts1 = [''.join(full_to_half(cell.get_text()).split()) for cell in cells1]
texts2 = [''.join(full_to_half(cell.get_text()).split()) for cell in cells2]
# Check visual consistency: identical text content and matching effective column counts
effective_cols_match = effective_cols1.get(i, 0) == effective_cols2.get(i, 0)
if texts1 == texts2 and effective_cols_match:
header_rows += 1
row_texts = [full_to_half(cell.get_text().strip()) for cell in cells1]
header_texts.append(row_texts)
else:
headers_match = header_rows > 0
break
if header_rows == 0:
headers_match = False
@@ -243,34 +362,44 @@ def check_rows_match(soup1, soup2):
if not (rows1 and rows2):
return False
# Get the last data row of the first table
# Get the index of the last data row of the first table
last_row_idx = None
last_row = None
for row in reversed(rows1):
if row.find_all(["td", "th"]):
last_row = row
for idx in range(len(rows1) - 1, -1, -1):
if rows1[idx].find_all(["td", "th"]):
last_row_idx = idx
last_row = rows1[idx]
break
# Detect the header row count so we can locate the second table's first data row
header_count, _, _ = detect_table_headers(soup1, soup2)
# Get the first data row of the second table
first_data_row_idx = None
first_data_row = None
if len(rows2) > header_count:
first_data_row_idx = header_count
first_data_row = rows2[header_count] # the first non-header row
if not (last_row and first_data_row):
return False
# Compute actual column counts (considering colspan) and visual column counts
# Compute effective column counts (considering rowspan and colspan)
last_row_effective_cols = calculate_row_effective_columns(soup1, last_row_idx)
first_row_effective_cols = calculate_row_effective_columns(soup2, first_data_row_idx)
# Compute actual column counts (colspan only) and visual column counts
last_row_cols = calculate_row_columns(last_row)
first_row_cols = calculate_row_columns(first_data_row)
last_row_visual_cols = calculate_visual_columns(last_row)
first_row_visual_cols = calculate_visual_columns(first_data_row)
# logger.debug(f"Row columns - last row of previous table: {last_row_cols} (visual: {last_row_visual_cols}), first row of current table: {first_row_cols} (visual: {first_row_visual_cols})")
# logger.debug(f"Row columns - last row of previous table: {last_row_cols} (effective: {last_row_effective_cols}, visual: {last_row_visual_cols}), first row of current table: {first_row_cols} (effective: {first_row_effective_cols}, visual: {first_row_visual_cols})")
# Accept a match on either actual or visual column counts
return last_row_cols == first_row_cols or last_row_visual_cols == first_row_visual_cols
# Accept a match on effective, actual, or visual column counts
return (last_row_effective_cols == first_row_effective_cols or
last_row_cols == first_row_cols or
last_row_visual_cols == first_row_visual_cols)
def check_row_columns_match(row1, row2):
@@ -287,12 +416,13 @@ def check_row_columns_match(row1, row2):
return True
def adjust_table_rows_colspan(rows, start_idx, end_idx,
def adjust_table_rows_colspan(soup, rows, start_idx, end_idx,
reference_structure, reference_visual_cols,
target_cols, current_cols, reference_row):
"""Adjust table rows' colspan attributes to match the target column count
Args:
soup: the BeautifulSoup-parsed table, used to compute effective column counts
rows: list of table rows
start_idx: start row index
end_idx: end row index (exclusive)
@@ -304,14 +434,21 @@ def adjust_table_rows_colspan(rows, start_idx, end_idx,
"""
reference_row_copy = deepcopy(reference_row)
# Build the effective-column-count matrix
effective_cols_matrix = build_table_occupied_matrix(soup)
for i in range(start_idx, end_idx):
row = rows[i]
cells = row.find_all(["td", "th"])
if not cells:
continue
# Use effective column counts (rowspan-aware) to decide whether adjustment is needed
current_row_effective_cols = effective_cols_matrix.get(i, 0)
current_row_cols = calculate_row_columns(row)
if current_row_cols >= target_cols:
# Skip if the effective or actual column count already meets the target
if current_row_effective_cols >= target_cols or current_row_cols >= target_cols:
continue
# Check whether the row's structure matches the reference row
@@ -323,9 +460,12 @@ def adjust_table_rows_colspan(rows, start_idx, end_idx,
cell["colspan"] = str(reference_structure[j])
else:
# Expand the last cell to fill the column-count gap
last_cell = cells[-1]
current_last_span = int(last_cell.get("colspan", 1))
last_cell["colspan"] = str(current_last_span + (target_cols - current_cols))
# Use the effective column count to compute the difference
cols_diff = target_cols - current_row_effective_cols
if cols_diff > 0:
last_cell = cells[-1]
current_last_span = int(last_cell.get("colspan", 1))
last_cell["colspan"] = str(current_last_span + cols_diff)
def perform_table_merge(soup1, soup2, previous_table_block, wait_merge_table_footnotes):
@@ -356,7 +496,7 @@ def perform_table_merge(soup1, soup2, previous_table_block, wait_merge_table_foo
reference_visual_cols = calculate_visual_columns(last_row1)
# Adjust table 2's rows using table 1's last row as the reference
adjust_table_rows_colspan(
rows2, header_count, len(rows2),
soup2, rows2, header_count, len(rows2),
reference_structure, reference_visual_cols,
table_cols1, table_cols2, first_data_row2
)
@@ -366,7 +506,7 @@ def perform_table_merge(soup1, soup2, previous_table_block, wait_merge_table_foo
reference_visual_cols = calculate_visual_columns(first_data_row2)
# Adjust table 1's rows using table 2's first data row as the reference
adjust_table_rows_colspan(
rows1, 0, len(rows1),
soup1, rows1, 0, len(rows1),
reference_structure, reference_visual_cols,
table_cols2, table_cols1, last_row1
)
@@ -445,4 +585,4 @@ def merge_table(page_info_list):
# Remove the current page's table
for block in current_table_block["blocks"]:
block['lines'] = []
block[SplitFlag.LINES_DELETED] = True
block[SplitFlag.LINES_DELETED] = True

View File

@@ -70,7 +70,7 @@ pipeline = [
"ultralytics>=8.3.48,<9",
"doclayout_yolo==0.0.4",
"dill>=0.3.8,<1",
"PyYAML>=6.0.2,<7",
"PyYAML>=6.0.1,<7",
"ftfy>=6.3.1,<7",
"shapely>=2.0.7,<3",
"pyclipper>=1.3.0,<2",