Compare commits

...

71 Commits

Author SHA1 Message Date
Xiaomeng Zhao
d05577666d Merge pull request #4409 from opendatalab/dev
Dev
2026-01-23 21:32:25 +08:00
Xiaomeng Zhao
cde5d02ffc Merge pull request #4408 from myhloli/dev
Dev
2026-01-23 21:28:45 +08:00
myhloli
cb0f3bc7a8 fix: update links for domestic computing platforms in README_zh-CN.md 2026-01-23 21:26:58 +08:00
myhloli
d21cbb3ef6 fix: update links for domestic computing platforms in README.md 2026-01-23 21:26:26 +08:00
Xiaomeng Zhao
d17d69e479 Merge pull request #4407 from myhloli/dev
Dev
2026-01-23 21:25:51 +08:00
myhloli
cab3fad4d5 feat: add support for domestic computing platforms in README.md 2026-01-23 21:25:09 +08:00
myhloli
26bc197599 feat: add compatibility note for domestic hardware platforms in README_zh-CN.md 2026-01-23 21:21:06 +08:00
myhloli
3e6aa26c5a refactor: clean up unused lines in table_merge.py 2026-01-23 21:17:45 +08:00
myhloli
2e0b836539 feat: enhance bf16 support checks for gcu, musa, and npu devices 2026-01-23 21:16:15 +08:00
myhloli
ff97c2bf00 feat: update Enflame.md driver formatting and enhance README_zh-CN.md with additional国产算力平台适配信息 2026-01-23 21:12:30 +08:00
Xiaomeng Zhao
ce620fedc9 Merge pull request #4405 from myhloli/dev
Dev
2026-01-23 21:02:08 +08:00
myhloli
8c1aed8627 feat: update MooreThreads.md for compatibility notes and modify musa.Dockerfile to install vision package 2026-01-23 20:57:31 +08:00
myhloli
bc18bc0665 feat: update musa Dockerfile version and modify package installation for compatibility 2026-01-23 20:20:49 +08:00
myhloli
1803384479 feat: update Docker run command in Enflame.md and modify gcu.Dockerfile for package installation improvements 2026-01-23 19:04:17 +08:00
myhloli
583855be67 feat: update Docker run command in Enflame.md and modify gcu.Dockerfile for improved package sources 2026-01-23 18:16:32 +08:00
myhloli
b2e4c047ad feat: update PyYAML version constraint in pyproject.toml 2026-01-23 16:31:18 +08:00
myhloli
c23c26b422 feat: update Docker run command in Enflame.md for new image and privileges 2026-01-23 15:15:33 +08:00
myhloli
75e018573d feat: add Dockerfile for Musa environment with MinerU and dependencies 2026-01-22 21:31:36 +08:00
myhloli
1269707b95 feat: add documentation for MooreThreads support in MinerU 2026-01-22 21:22:12 +08:00
myhloli
dd2929b7e8 feat: refactor compilation_config handling to use CompilationConfig object in vlm_analyze 2026-01-22 19:46:48 +08:00
myhloli
04aceee288 feat: ensure compilation_config is a valid JSON string in vlm_analyze 2026-01-22 19:40:44 +08:00
myhloli
fb046d3a5a feat: update compilation_config to dictionary format in vlm_analyze for MUSA devices 2026-01-22 19:30:51 +08:00
myhloli
1912adae65 refactor: remove JSON parsing for compilation_config in vlm_analyze 2026-01-22 19:27:12 +08:00
myhloli
d39c4b5d64 feat: update compilation configuration format to JSON string in vlm_analyze for MUSA devices 2026-01-22 19:26:43 +08:00
myhloli
d4622caf1a feat: add block size and compilation config arguments for MUSA devices in vllm_server 2026-01-22 18:58:10 +08:00
myhloli
5f7214bf2f feat: integrate cudagraph_capture_sizes into vlm_analyze for MUSA devices 2026-01-22 18:52:53 +08:00
myhloli
294105c1b0 feat: add cudagraph_capture_sizes to compilation configuration for MUSA devices 2026-01-22 18:40:22 +08:00
myhloli
e8548eddde fix: correct boolean value casing in compilation configuration for MUSA devices 2026-01-22 18:30:31 +08:00
myhloli
5bdf4ce86a feat: add JSON import for compilation configuration in vlm_analyze 2026-01-22 18:28:15 +08:00
myhloli
b9465238f5 feat: add compilation configuration support for MUSA devices in utils and vlm_analyze 2026-01-22 18:21:04 +08:00
myhloli
ffecb89e33 feat: add support for MUSA devices in Unimernet model initialization 2026-01-22 16:14:52 +08:00
myhloli
6a75b39940 feat: add support for MUSA and NPU devices in device management functions 2026-01-22 15:45:26 +08:00
myhloli
313ec8afa0 feat: add MinerU configuration options for vlm/hybrid backend 2026-01-21 11:16:29 +08:00
myhloli
940289d083 feat: add Enflame setup guide and Dockerfile for GCU support 2026-01-20 18:55:29 +08:00
myhloli
21c9267a93 feat: enhance memory management for GCU device support in clean_memory function 2026-01-20 17:34:27 +08:00
myhloli
10d996d14e feat: add 2.7.2 release notes with cross-page table merging optimization and new device support 2026-01-20 17:26:21 +08:00
myhloli
1d697c20bf feat: update region area calculation for compatibility with skimage version 0.26.0 2026-01-20 17:07:53 +08:00
Xiaomeng Zhao
333f6d3a32 Merge branch 'opendatalab:dev' into dev 2026-01-20 16:26:22 +08:00
myhloli
7620bd4ccc feat: add GCU device support for bf16 and memory calculations 2026-01-20 16:06:20 +08:00
myhloli
5706011633 feat: update device handling in YOLO model initialization for improved compatibility 2026-01-19 15:58:10 +08:00
myhloli
df07baea6c feat: enhance table merging logic with effective column calculations and visual consistency checks 2026-01-16 18:58:10 +08:00
myhloli
c73c1d3847 feat: add support for Chinese continuation marker in table merging logic 2026-01-16 17:17:47 +08:00
Xiaomeng Zhao
c6543b4aeb Merge pull request #4368 from myhloli/dev
feat: replace Gradio app script with iframe for improved integration
2026-01-14 17:01:59 +08:00
myhloli
5116192d32 feat: replace Gradio app script with iframe for improved integration 2026-01-14 17:01:11 +08:00
Xiaomeng Zhao
201ba86072 Merge pull request #4365 from tommygood/docs/fix-typos-spans-pdf
Docs: correct file naming format to use '_span.pdf'
2026-01-14 16:44:38 +08:00
tommygood
11252a5636 docs: correct file naming format to use '_span.pdf' 2026-01-14 16:02:38 +08:00
Xiaomeng Zhao
03698c656e Merge pull request #4358 from myhloli/dev
feat: add Hygon entry to acceleration cards list
2026-01-13 19:50:19 +08:00
myhloli
48ded6b06c feat: add Hygon entry to acceleration cards list 2026-01-13 19:47:08 +08:00
Xiaomeng Zhao
4e66217909 Merge pull request #4357 from myhloli/dev
Dev
2026-01-13 19:46:07 +08:00
myhloli
bdec40487e feat: add Dockerfile for vLLM inference environment and Hygon platform documentation 2026-01-13 19:45:18 +08:00
myhloli
ec9b05003d fix: add support for ellipsis continuation marker in table merging logic 2026-01-13 15:50:15 +08:00
Xiaomeng Zhao
c10f248721 Merge pull request #4330 from opendatalab/master
master->dev
2026-01-09 12:04:07 +08:00
Xiaomeng Zhao
30c5d10e05 Archive MinerU Project List and update notes
Updated README to indicate the project is archived and added a note about community contributions.
2026-01-09 12:02:21 +08:00
Xiaomeng Zhao
4c3be9273c Fix typo in README_zh-CN.md 2026-01-09 12:00:26 +08:00
Xiaomeng Zhao
1833163b97 Mark MinerU project as archived in README
Updated README to indicate the project is archived and added a note about community contributions.
2026-01-09 12:00:10 +08:00
Xiaomeng Zhao
eb55029adf Merge pull request #4318 from myhloli/dev
Dev
2026-01-07 20:27:38 +08:00
myhloli
2eef53a9f0 fix: improve continuation marker handling in table caption merging logic 2026-01-07 20:17:13 +08:00
myhloli
9e6e2bde85 fix: refine caption merging logic to improve handling of continuation markers 2026-01-07 20:05:26 +08:00
myhloli
c73e93bec0 fix: enhance table merging logic to handle footnotes more effectively 2026-01-07 19:50:46 +08:00
Xiaomeng Zhao
1dfbea157a Merge pull request #4306 from opendatalab/master
master->dev
2026-01-06 15:03:27 +08:00
myhloli
96840733c4 Update version.py with new version 2026-01-06 06:55:29 +00:00
Xiaomeng Zhao
45f8ad1d5c Merge pull request #4305 from opendatalab/release-2.7.1
Release 2.7.1
2026-01-06 14:47:23 +08:00
Xiaomeng Zhao
b69191ba2b Merge pull request #4304 from opendatalab/dev
Dev
2026-01-06 14:46:18 +08:00
Xiaomeng Zhao
0028514ced Merge pull request #4303 from myhloli/dev
Dev
2026-01-06 14:45:35 +08:00
myhloli
8d8daf6851 fix: add qwen-vl-utils dependency to pyproject.toml 2026-01-06 14:44:53 +08:00
myhloli
815280dd23 fix: update pdfminer.six dependency to resolve CVE-2025-64512 and improve EXIF handling 2026-01-06 14:42:48 +08:00
myhloli
7b52f92aea fix: update pdfminer.six dependency to resolve CVE-2025-64512 and improve EXIF handling 2026-01-06 14:41:47 +08:00
Xiaomeng Zhao
33543b76c9 Merge pull request #4301 from myhloli/dev
Dev
2026-01-06 14:10:08 +08:00
myhloli
ea5f8e98dd fix: update pdfminer.six version to 20251230 in pyproject.toml 2026-01-06 11:54:17 +08:00
myhloli
8996e06448 fix: restore hybrid analyze imports in common.py for backend processing 2026-01-06 11:51:31 +08:00
myhloli
bfb304ef1f fix: improve EXIF handling and save PDF logic in pdf_image_tools.py 2026-01-05 00:27:01 +08:00
30 changed files with 846 additions and 66 deletions

View File

@@ -45,6 +45,22 @@
# Changelog
- 2026/01/23 2.7.2 Release
- Added support for domestic computing platforms Hygon, Enflame, and Moore Threads. Currently, the officially supported domestic computing platforms include:
- [Ascend](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Ascend/)
- [T-Head](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/THead/)
- [METAX](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/METAX/)
- [Hygon](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Hygon/)
- [Enflame](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Enflame/)
- [MooreThreads](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/MooreThreads/)
- MinerU continues to ensure compatibility with domestic hardware platforms, supporting mainstream chip architectures. With secure and reliable technology, we empower researchers, government, and enterprises to reach new heights in document digitization!
- Cross-page table merging optimization, improving merge success rate and merge quality
- 2026/01/06 2.7.1 Release
- fix bug: #4300
- Updated pdfminer.six dependency version to resolve [CVE-2025-64512](https://github.com/advisories/GHSA-wf5f-4jwr-ppcp)
- Support automatic correction of input image exif orientation to improve OCR recognition accuracy #4283
- 2025/12/30 2.7.0 Release
- Simplified installation process. No need to separately install `vlm` acceleration engine dependencies. Using `uv pip install mineru[all]` during installation will install all optional backend dependencies.
- Added new `hybrid` backend, which combines the advantages of `pipeline` and `vlm` backends. Built on vlm, it integrates some capabilities of pipeline, adding extra extensibility on top of high accuracy:

View File

@@ -45,6 +45,22 @@
# 更新记录
- 2026/01/23 2.7.2 发布
- 新增国产算力平台海光、燧原、摩尔线程的适配支持,目前已由官方适配并支持的国产算力平台包括:
- [昇腾 Ascend](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Ascend/)
- [平头哥 T-Head](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/THead/)
- [沐曦 METAX](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/METAX/)
- [海光 DCU](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Hygon/)
- [燧原 GCU](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Enflame/)
- [摩尔线程 MUSA](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/MooreThreads/)
- MinerU 持续兼容国产硬件平台,支持主流芯片架构。以安全可靠的技术,助力科研、政企用户迈向文档数字化新高度!
- 跨页表合并优化,提升合并成功率与合并效果
- 2026/01/06 2.7.1 发布
- fix bug: #4300
- 更新pdfminer.six的依赖版本以解决 [CVE-2025-64512](https://github.com/advisories/GHSA-wf5f-4jwr-ppcp)
- 支持输入图像的exif方向自动校正提升OCR识别效果 #4283
- 2025/12/30 2.7.0 发布
- 简化安装流程,现在不再需要单独安装`vlm`加速引擎依赖包,安装时使用`uv pip install mineru[all]`即可安装所有可选后端的依赖包。
- 增加全新后端`hybrid`,该后端结合了`pipeline`与`vlm`后端的优势,在vlm的基础上融入了pipeline的部分能力,在高精度的基础上增加了额外的扩展性:

View File

@@ -0,0 +1,34 @@
# Base image containing the vLLM inference environment, requiring amd64(x86-64) CPU + Hygon DCU.
FROM harbor.sourcefind.cn:5443/dcu/admin/base/vllm:0.9.2-ubuntu22.04-dtk25.04.2-1226-das1.7-py3.10-20251226
# Install libgl for opencv support & Noto fonts for Chinese characters
RUN apt-get update && \
apt-get install -y \
fonts-noto-core \
fonts-noto-cjk \
fontconfig && \
fc-cache -fv && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Install mineru latest
RUN python3 -m pip install -U pip -i https://mirrors.aliyun.com/pypi/simple && \
python3 -m pip install mineru[api,gradio] \
"matplotlib>=3.10,<4" \
"ultralytics>=8.3.48,<9" \
"doclayout_yolo==0.0.4" \
"ftfy>=6.3.1,<7" \
"shapely>=2.0.7,<3" \
"pyclipper>=1.3.0,<2" \
"omegaconf>=2.3.0,<3" \
numpy==1.25.0 \
opencv-python==4.11.0.86 \
-i https://mirrors.aliyun.com/pypi/simple && \
python3 -m pip cache purge
# Download models and update the configuration file
RUN /bin/bash -c "mineru-models-download -s modelscope -m all"
# Set the entry point to activate the virtual environment and run the command line tool
ENTRYPOINT ["/bin/bash", "-c", "export MINERU_MODEL_SOURCE=local && exec \"$@\"", "--"]

View File

@@ -0,0 +1,30 @@
# Base image containing the vLLM inference environment, requiring amd64(x86-64) CPU + Enflame GCU.
FROM crpi-vofi3w62lkohhxsp.cn-shanghai.personal.cr.aliyuncs.com/opendatalab-mineru/gcu:docker_images_topsrider_i3x_3.6.20260106_vllm0.11_pytorch2.8.0
# Install libgl for opencv support & Noto fonts for Chinese characters
RUN echo 'deb http://mirrors.aliyun.com/ubuntu/ noble main restricted universe multiverse\n\
deb http://mirrors.aliyun.com/ubuntu/ noble-updates main restricted universe multiverse\n\
deb http://mirrors.aliyun.com/ubuntu/ noble-backports main restricted universe multiverse\n\
deb http://mirrors.aliyun.com/ubuntu/ noble-security main restricted universe multiverse' > /tmp/aliyun-sources.list && \
apt-get -o Dir::Etc::SourceList=/tmp/aliyun-sources.list update && \
apt-get -o Dir::Etc::SourceList=/tmp/aliyun-sources.list install -y \
fonts-noto-core \
fonts-noto-cjk \
fontconfig && \
fc-cache -fv && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/aliyun-sources.list
# Install mineru latest
RUN python3 -m pip install "mineru[core]>=2.7.2" \
numpy==1.26.4 \
opencv-python==4.11.0.86 \
-i https://mirrors.aliyun.com/pypi/simple && \
python3 -m pip cache purge
# Download models and update the configuration file
RUN /bin/bash -c "mineru-models-download -s modelscope -m all"
# Set the entry point to activate the virtual environment and run the command line tool
ENTRYPOINT ["/bin/bash", "-c", "export MINERU_MODEL_SOURCE=local && exec \"$@\"", "--"]

View File

@@ -0,0 +1,38 @@
# Base image containing the vLLM inference environment, requiring amd64(x86-64) CPU + MooreThreads GPU.
FROM registry.mthreads.com/mcconline/vllm-musa-qy2-py310:v0.8.4-release
# Install libgl for opencv support & Noto fonts for Chinese characters
RUN apt-get update && \
apt-get install -y \
fonts-noto-core \
fonts-noto-cjk \
fontconfig \
libgl1 && \
fc-cache -fv && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Install mineru latest
RUN python3 -m pip install -U pip -i https://mirrors.aliyun.com/pypi/simple && \
git clone https://gitcode.com/gh_mirrors/vi/vision.git -b v0.20.0 --depth 1 && \
cd vision && \
python3 setup.py install && \
python3 -m pip install "mineru[api,gradio]>=2.7.2" \
"matplotlib>=3.10,<4" \
"ultralytics>=8.3.48,<9" \
"doclayout_yolo==0.0.4" \
"ftfy>=6.3.1,<7" \
"shapely>=2.0.7,<3" \
"pyclipper>=1.3.0,<2" \
"omegaconf>=2.3.0,<3" \
numpy==1.26.4 \
opencv-python==4.11.0.86 \
-i https://mirrors.aliyun.com/pypi/simple && \
python3 -m pip cache purge
# Download models and update the configuration file
RUN /bin/bash -c "mineru-models-download -s modelscope -m all"
# Set the entry point to activate the virtual environment and run the command line tool
ENTRYPOINT ["/bin/bash", "-c", "export MINERU_MODEL_SOURCE=local && exec \"$@\"", "--"]

View File

@@ -1,2 +1 @@
<script type="module" src="https://gradio.s3-us-west-2.amazonaws.com/5.35.0/gradio.js"></script>
<gradio-app src="https://opendatalab-mineru.hf.space"></gradio-app>
<iframe src="https://opendatalab-mineru.hf.space" style="min-height: calc(-40px + 100vh); width: 100%; flex-grow: 1; border: medium; overflow: auto; height: 1746px;"></iframe>

View File

@@ -29,12 +29,12 @@ The following sections provide detailed descriptions of each file's purpose and
![layout page example](../images/layout_example.png)
### Text Spans File (spans.pdf)
### Text Spans File (span.pdf)
> [!NOTE]
> Only applicable to pipeline backend
**File naming format**: `{original_filename}_spans.pdf`
**File naming format**: `{original_filename}_span.pdf`
**Functionality**:
@@ -702,7 +702,7 @@ The above files constitute MinerU's complete output results. Users can choose ap
- **Debugging and verification** (Use visualization files):
* layout.pdf
* spans.pdf
* span.pdf
- **Content extraction**: (Use simplified files):
* *.md

View File

@@ -125,3 +125,9 @@ Here are the environment variables and their descriptions:
- `MINERU_HYBRID_FORCE_PIPELINE_ENABLE`:
* Used to force the text extraction part in `hybrid-*` backends to be processed using small models.
* Defaults to `false`. Can be set to `true` via environment variable to enable this feature, thereby reducing hallucinations in certain extreme cases.
- `MINERU_VL_MODEL_NAME`:
* Used to specify the model name for the vlm/hybrid backend, allowing you to designate the model required for MinerU to run when multiple models exist on a remote openai-server.
- `MINERU_VL_API_KEY`:
* Used to specify the API Key for the vlm/hybrid backend, enabling authentication on the remote openai-server.

View File

@@ -29,12 +29,12 @@
![layout 页面示例](../images/layout_example.png)
### 文本片段文件 (spans.pdf)
### 文本片段文件 (span.pdf)
> [!NOTE]
> 仅适用于 pipeline 后端
**文件命名格式**`{原文件名}_spans.pdf`
**文件命名格式**`{原文件名}_span.pdf`
**功能说明**
@@ -817,7 +817,7 @@ vlm 后端的 content_list.json 文件结构与 pipeline 后端类似,伴随
- **调试和验证**(使用可视化文件):
* layout.pdf
* spans.pdf
* span.pdf
- **内容提取**(使用简化文件):
* *.md

View File

@@ -0,0 +1,109 @@
## 1. 测试平台
以下为本指南测试使用的平台信息,供参考:
```
os: Ubuntu 22.04.4 LTS
cpu: Intel x86-64
gcu: Enflame S60
driver: 1.7.0.9
docker: 28.0.1
```
## 2. 环境准备
### 2.1 使用 Dockerfile 构建镜像
```bash
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/china/gcu.Dockerfile
docker build --network=host -t mineru:gcu-vllm-latest -f gcu.Dockerfile .
```
## 3. 启动 Docker 容器
```bash
docker run -u root --name mineru_docker \
--network=host \
--ipc=host \
--privileged \
-e MINERU_MODEL_SOURCE=local \
-it mineru:gcu-vllm-latest \
/bin/bash
```
执行该命令后,您将进入到Docker容器的交互式终端,您可以直接在容器内运行MinerU相关命令来使用MinerU的功能。
您也可以直接通过替换`/bin/bash`为服务启动命令来启动MinerU服务详细说明请参考[通过命令启动服务](https://opendatalab.github.io/MinerU/zh/usage/quick_usage/#apiwebuihttp-clientserver)。
## 4. 注意事项
不同环境下,MinerU对Enflame加速卡的支持情况如下表所示:
<table border="1">
<thead>
<tr>
<th rowspan="2" colspan="2">使用场景</th>
<th colspan="2">容器环境</th>
</tr>
<tr>
<th>vllm</th>
</tr>
</thead>
<tbody>
<tr>
<td rowspan="3">命令行工具(mineru)</td>
<td>pipeline</td>
<td>🟢</td>
</tr>
<tr>
<td>&lt;vlm/hybrid&gt;-auto-engine</td>
<td>🟢</td>
</tr>
<tr>
<td>&lt;vlm/hybrid&gt;-http-client</td>
<td>🟢</td>
</tr>
<tr>
<td rowspan="3">fastapi服务(mineru-api)</td>
<td>pipeline</td>
<td>🟢</td>
</tr>
<tr>
<td>&lt;vlm/hybrid&gt;-auto-engine</td>
<td>🟢</td>
</tr>
<tr>
<td>&lt;vlm/hybrid&gt;-http-client</td>
<td>🟢</td>
</tr>
<tr>
<td rowspan="3">gradio界面(mineru-gradio)</td>
<td>pipeline</td>
<td>🟢</td>
</tr>
<tr>
<td>&lt;vlm/hybrid&gt;-auto-engine</td>
<td>🟢</td>
</tr>
<tr>
<td>&lt;vlm/hybrid&gt;-http-client</td>
<td>🟢</td>
</tr>
<tr>
<td colspan="2">openai-server服务(mineru-openai-server)</td>
<td>🟢</td>
</tr>
<tr>
<td colspan="2">数据并行 (--data-parallel-size)</td>
<td>🔴</td>
</tr>
</tbody>
</table>
注:
🟢: 支持,运行较稳定,精度与Nvidia GPU基本一致
🟡: 支持但较不稳定,在某些场景下可能出现异常,或精度存在一定差异
🔴: 不支持,无法运行,或精度存在较大差异
>[!TIP]
>GCU加速卡指定可用加速卡的方式与NVIDIA GPU类似请参考[使用指定GPU设备](https://opendatalab.github.io/MinerU/zh/usage/advanced_cli_parameters/#cuda_visible_devices)章节说明,
>将环境变量`CUDA_VISIBLE_DEVICES`替换为`TOPS_VISIBLE_DEVICES`即可。

View File

@@ -0,0 +1,115 @@
## 1. 测试平台
以下为本指南测试使用的平台信息,供参考:
```
os: Ubuntu 22.04.3 LTS
cpu: Hygon C86-4G(x86-64)
dcu: BW200
driver: 6.3.13-V1.12.0a
docker: 20.10.24
```
## 2. 环境准备
### 2.1 使用 Dockerfile 构建镜像
```bash
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/china/dcu.Dockerfile
docker build --network=host -t mineru:dcu-vllm-latest -f dcu.Dockerfile .
```
## 3. 启动 Docker 容器
```bash
docker run -u root --name mineru_docker \
--network=host \
--ipc=host \
--shm-size=16G \
--device=/dev/kfd \
--device=/dev/mkfd \
--device=/dev/dri \
-v /opt/hyhal:/opt/hyhal \
--group-add video \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
-e MINERU_MODEL_SOURCE=local \
-it mineru:dcu-vllm-latest \
/bin/bash
```
执行该命令后,您将进入到Docker容器的交互式终端,您可以直接在容器内运行MinerU相关命令来使用MinerU的功能。
您也可以直接通过替换`/bin/bash`为服务启动命令来启动MinerU服务详细说明请参考[通过命令启动服务](https://opendatalab.github.io/MinerU/zh/usage/quick_usage/#apiwebuihttp-clientserver)。
## 4. 注意事项
不同环境下,MinerU对Hygon加速卡的支持情况如下表所示:
<table border="1">
<thead>
<tr>
<th rowspan="2" colspan="2">使用场景</th>
<th colspan="2">容器环境</th>
</tr>
<tr>
<th>vllm</th>
</tr>
</thead>
<tbody>
<tr>
<td rowspan="3">命令行工具(mineru)</td>
<td>pipeline</td>
<td>🟢</td>
</tr>
<tr>
<td>&lt;vlm/hybrid&gt;-auto-engine</td>
<td>🟢</td>
</tr>
<tr>
<td>&lt;vlm/hybrid&gt;-http-client</td>
<td>🟢</td>
</tr>
<tr>
<td rowspan="3">fastapi服务(mineru-api)</td>
<td>pipeline</td>
<td>🟢</td>
</tr>
<tr>
<td>&lt;vlm/hybrid&gt;-auto-engine</td>
<td>🟢</td>
</tr>
<tr>
<td>&lt;vlm/hybrid&gt;-http-client</td>
<td>🟢</td>
</tr>
<tr>
<td rowspan="3">gradio界面(mineru-gradio)</td>
<td>pipeline</td>
<td>🟢</td>
</tr>
<tr>
<td>&lt;vlm/hybrid&gt;-auto-engine</td>
<td>🟢</td>
</tr>
<tr>
<td>&lt;vlm/hybrid&gt;-http-client</td>
<td>🟢</td>
</tr>
<tr>
<td colspan="2">openai-server服务(mineru-openai-server)</td>
<td>🟢</td>
</tr>
<tr>
<td colspan="2">数据并行 (--data-parallel-size)</td>
<td>🟢</td>
</tr>
</tbody>
</table>
注:
🟢: 支持,运行较稳定,精度与Nvidia GPU基本一致
🟡: 支持但较不稳定,在某些场景下可能出现异常,或精度存在一定差异
🔴: 不支持,无法运行,或精度存在较大差异
>[!TIP]
>DCU加速卡指定可用加速卡的方式与AMD GPU类似请参考[GPU isolation techniques](https://rocm.docs.amd.com/en/docs-6.2.4/conceptual/gpu-isolation.html)

View File

@@ -0,0 +1,115 @@
## 1. 测试平台
以下为本指南测试使用的平台信息,供参考:
```
os: Ubuntu 22.04.4 LTS
cpu: Intel x86-64
gpu: MTT S4000
driver: 3.0.0-rc-KuaE2.0
docker: 24.0.7
```
## 2. 环境准备
### 2.1 使用 Dockerfile 构建镜像
```bash
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/china/musa.Dockerfile
docker build --network=host -t mineru:musa-vllm-latest -f musa.Dockerfile .
```
## 3. 启动 Docker 容器
```bash
docker run -u root --name mineru_docker \
--network=host \
--ipc=host \
--shm-size=80g \
--privileged \
-e MTHREADS_VISIBLE_DEVICES=all \
-e MINERU_MODEL_SOURCE=local \
-it mineru:musa-vllm-latest \
/bin/bash
```
执行该命令后,您将进入到Docker容器的交互式终端,您可以直接在容器内运行MinerU相关命令来使用MinerU的功能。
您也可以直接通过替换`/bin/bash`为服务启动命令来启动MinerU服务详细说明请参考[通过命令启动服务](https://opendatalab.github.io/MinerU/zh/usage/quick_usage/#apiwebuihttp-clientserver)。
## 4. 注意事项
不同环境下,MinerU对MooreThreads加速卡的支持情况如下表所示:
>[!NOTE]
> **兼容性说明**:由于摩尔线程(MooreThreads)目前对 vLLM v1 引擎的支持尚待完善,MinerU 现阶段采用 v0 引擎作为适配方案。
> 受此限制,vLLM 的异步引擎(Async Engine)功能存在兼容性问题,可能导致部分使用场景无法正常运行。
> 我们将持续跟进摩尔线程对 vLLM v1 引擎的支持进展,并及时在 MinerU 中进行相应的适配与优化。
<table border="1">
<thead>
<tr>
<th rowspan="2" colspan="2">使用场景</th>
<th colspan="2">容器环境</th>
</tr>
<tr>
<th>vllm</th>
</tr>
</thead>
<tbody>
<tr>
<td rowspan="3">命令行工具(mineru)</td>
<td>pipeline</td>
<td>🟢</td>
</tr>
<tr>
<td>&lt;vlm/hybrid&gt;-auto-engine</td>
<td>🟢</td>
</tr>
<tr>
<td>&lt;vlm/hybrid&gt;-http-client</td>
<td>🟢</td>
</tr>
<tr>
<td rowspan="3">fastapi服务(mineru-api)</td>
<td>pipeline</td>
<td>🟢</td>
</tr>
<tr>
<td>&lt;vlm/hybrid&gt;-auto-engine</td>
<td>🔴</td>
</tr>
<tr>
<td>&lt;vlm/hybrid&gt;-http-client</td>
<td>🟢</td>
</tr>
<tr>
<td rowspan="3">gradio界面(mineru-gradio)</td>
<td>pipeline</td>
<td>🟢</td>
</tr>
<tr>
<td>&lt;vlm/hybrid&gt;-auto-engine</td>
<td>🔴</td>
</tr>
<tr>
<td>&lt;vlm/hybrid&gt;-http-client</td>
<td>🟢</td>
</tr>
<tr>
<td colspan="2">openai-server服务(mineru-openai-server)</td>
<td>🟢</td>
</tr>
<tr>
<td colspan="2">数据并行 (--data-parallel-size)</td>
<td>🔴</td>
</tr>
</tbody>
</table>
注:
🟢: 支持,运行较稳定,精度与Nvidia GPU基本一致
🟡: 支持但较不稳定,在某些场景下可能出现异常,或精度存在一定差异
🔴: 不支持,无法运行,或精度存在较大差异
>[!TIP]
>MooreThreads加速卡指定可用加速卡的方式与NVIDIA GPU类似请参考[GPU 枚举](https://docs.mthreads.com/cloud-native/cloud-native-doc-online/install_guide/#gpu-%E6%9E%9A%E4%B8%BE)

View File

@@ -119,4 +119,10 @@ MinerU命令行工具的某些参数存在相同功能的环境变量配置
- `MINERU_HYBRID_FORCE_PIPELINE_ENABLE`
* 用于强制将 hybrid-* 后端中的 文本提取部分使用 小模型 进行处理
* 默认为`false`,可通过环境变量设置为`true`来启用该功能,从而在某些极端情况下减少幻觉的发生。
* 默认为`false`,可通过环境变量设置为`true`来启用该功能,从而在某些极端情况下减少幻觉的发生。
- `MINERU_VL_MODEL_NAME`:
* 用于指定 vlm/hybrid 后端使用的模型名称这将允许您在同时存在多个模型的远程openai-server中指定 MinerU 运行所需的模型。
- `MINERU_VL_API_KEY`:
* 用于指定 vlm/hybrid 后端使用的API Key这将允许您在远程openai-server中进行身份验证。

View File

@@ -12,6 +12,9 @@
* [昇腾 Ascend](acceleration_cards/Ascend.md) 🚀
* [平头哥 T-Head](acceleration_cards/THead.md) 🚀
* [沐曦 METAX](acceleration_cards/METAX.md) 🚀
* [海光 Hygon](acceleration_cards/Hygon.md) 🚀
* [燧原 Enflame](acceleration_cards/Enflame.md) 🚀
* [摩尔线程 MooreThreads](acceleration_cards/MooreThreads.md) 🚀
* [AMD](acceleration_cards/AMD.md) [#3662](https://github.com/opendatalab/MinerU/discussions/3662) ❤️
* [太初元碁 Tecorigin](acceleration_cards/Tecorigin.md) [#3767](https://github.com/opendatalab/MinerU/pull/3767) ❤️
* [寒武纪 Cambricon](acceleration_cards/Cambricon.md) [#4004](https://github.com/opendatalab/MinerU/discussions/4004) ❤️

View File

@@ -18,6 +18,10 @@ def enable_custom_logits_processors() -> bool:
compute_capability = f"{major}.{minor}"
elif hasattr(torch, 'npu') and torch.npu.is_available():
compute_capability = "8.0"
elif hasattr(torch, 'gcu') and torch.gcu.is_available():
compute_capability = "8.0"
elif hasattr(torch, 'musa') and torch.musa.is_available():
compute_capability = "8.0"
else:
logger.info("CUDA not available, disabling custom_logits_processors")
return False

View File

@@ -1,6 +1,7 @@
# Copyright (c) Opendatalab. All rights reserved.
import os
import time
import json
from loguru import logger
@@ -99,6 +100,30 @@ class ModelSingleton:
import vllm
except ImportError:
raise ImportError("Please install vllm to use the vllm-engine backend.")
"""
# musa vllm v1 引擎特殊配置
device = get_device()
if device.startswith("musa"):
import torch
if torch.musa.is_available():
compilation_config = {
"cudagraph_capture_sizes": [1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 20, 24, 28, 30],
"simple_cuda_graph": True
}
block_size = 32
kwargs["compilation_config"] = compilation_config
kwargs["block_size"] = block_size
"""
if "compilation_config" in kwargs:
if isinstance(kwargs["compilation_config"], str):
try:
kwargs["compilation_config"] = json.loads(kwargs["compilation_config"])
except json.JSONDecodeError:
logger.warning(
f"Failed to parse compilation_config as JSON: {kwargs['compilation_config']}")
del kwargs["compilation_config"]
if "gpu_memory_utilization" not in kwargs:
kwargs["gpu_memory_utilization"] = set_default_gpu_memory_utilization()
if "model" not in kwargs:
@@ -112,8 +137,38 @@ class ModelSingleton:
try:
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.v1.engine.async_llm import AsyncLLM
from vllm.config import CompilationConfig
except ImportError:
raise ImportError("Please install vllm to use the vllm-async-engine backend.")
"""
# musa vllm v1 引擎特殊配置
device = get_device()
if device.startswith("musa"):
import torch
if torch.musa.is_available():
compilation_config = CompilationConfig(
cudagraph_capture_sizes=[1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 20, 24, 28, 30],
simple_cuda_graph=True
)
block_size = 32
kwargs["compilation_config"] = compilation_config
kwargs["block_size"] = block_size
"""
if "compilation_config" in kwargs:
if isinstance(kwargs["compilation_config"], dict):
# 如果是字典,转换为 CompilationConfig 对象
kwargs["compilation_config"] = CompilationConfig(**kwargs["compilation_config"])
elif isinstance(kwargs["compilation_config"], str):
# 如果是 JSON 字符串,先解析再转换
try:
config_dict = json.loads(kwargs["compilation_config"])
kwargs["compilation_config"] = CompilationConfig(**config_dict)
except (json.JSONDecodeError, TypeError) as e:
logger.warning(
f"Failed to parse compilation_config: {kwargs['compilation_config']}, error: {e}")
del kwargs["compilation_config"]
if "gpu_memory_utilization" not in kwargs:
kwargs["gpu_memory_utilization"] = set_default_gpu_memory_utilization()
if "model" not in kwargs:

View File

@@ -17,8 +17,6 @@ from mineru.utils.pdf_image_tools import images_bytes_to_pdf_bytes
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
from mineru.backend.vlm.vlm_analyze import aio_doc_analyze as aio_vlm_doc_analyze
from mineru.backend.hybrid.hybrid_analyze import doc_analyze as hybrid_doc_analyze
from mineru.backend.hybrid.hybrid_analyze import aio_doc_analyze as aio_hybrid_doc_analyze
from mineru.utils.pdf_page_id import get_end_page_id
if os.getenv("MINERU_LMDEPLOY_DEVICE", "") == "maca":
@@ -326,6 +324,7 @@ def _process_hybrid(
server_url=None,
**kwargs,
):
from mineru.backend.hybrid.hybrid_analyze import doc_analyze as hybrid_doc_analyze
"""同步处理hybrid后端逻辑"""
if not backend.endswith("client"):
server_url = None
@@ -378,8 +377,8 @@ async def _async_process_hybrid(
server_url=None,
**kwargs,
):
from mineru.backend.hybrid.hybrid_analyze import aio_doc_analyze as aio_hybrid_doc_analyze
"""异步处理hybrid后端逻辑"""
if not backend.endswith("client"):
server_url = None

View File

@@ -1,5 +1,7 @@
import os
from typing import List, Union
import torch
from tqdm import tqdm
from ultralytics import YOLO
import numpy as np
@@ -18,8 +20,8 @@ class YOLOv8MFDModel:
conf: float = 0.25,
iou: float = 0.45,
):
self.model = YOLO(weight).to(device)
self.device = device
self.device = torch.device(device)
self.model = YOLO(weight).to(self.device)
self.imgsz = imgsz
self.conf = conf
self.iou = iou

View File

@@ -23,12 +23,12 @@ class MathDataset(Dataset):
class UnimernetModel(object):
def __init__(self, weight_dir, _device_="cpu"):
from .unimernet_hf import UnimernetModel
if _device_.startswith("mps") or _device_.startswith("npu"):
if _device_.startswith("mps") or _device_.startswith("npu") or _device_.startswith("musa"):
self.model = UnimernetModel.from_pretrained(weight_dir, attn_implementation="eager")
else:
self.model = UnimernetModel.from_pretrained(weight_dir)
self.device = _device_
self.model.to(_device_)
self.device = torch.device(_device_)
self.model.to(self.device)
if not _device_.startswith("cpu"):
self.model = self.model.to(dtype=torch.float16)
self.model.eval()

View File

@@ -4,6 +4,8 @@ import cv2
import numpy as np
from scipy.spatial import distance as dist
from skimage import measure
from skimage import __version__ as skimage_version
from packaging import version
def transform_preds(coords, center, scale, output_size, rot=0):
@@ -295,7 +297,11 @@ def min_area_rect_box(
"""
boxes = []
for region in regions:
if region.bbox_area > H * W * 3 / 4: # 过滤大的单元格
if version.parse(skimage_version) >= version.parse("0.26.0"):
region_bbox_area = region.area_bbox
else:
region_bbox_area = region.bbox_area
if region_bbox_area > H * W * 3 / 4: # 过滤大的单元格
continue
rect = cv2.minAreaRect(region.coords[:, ::-1])

View File

@@ -2,6 +2,7 @@ import os
import sys
from mineru.backend.vlm.utils import set_default_gpu_memory_utilization, enable_custom_logits_processors
from mineru.utils.config_reader import get_device
from mineru.utils.models_download_utils import auto_download_and_get_model_root_path
from vllm.entrypoints.cli.main import main as vllm_main
@@ -13,6 +14,8 @@ def main():
has_port_arg = False
has_gpu_memory_utilization_arg = False
has_logits_processors_arg = False
has_block_size_arg = False
has_compilation_config = False
model_path = None
model_arg_indices = []
@@ -24,6 +27,10 @@ def main():
has_gpu_memory_utilization_arg = True
if arg == "--logits-processors" or arg.startswith("--logits-processors="):
has_logits_processors_arg = True
if arg == "--block-size" or arg.startswith("--block-size="):
has_block_size_arg = True
if arg == "--compilation-config" or arg.startswith("--compilation-config="):
has_compilation_config = True
if arg == "--model":
if i + 1 < len(args):
model_path = args[i + 1]
@@ -49,6 +56,17 @@ def main():
model_path = auto_download_and_get_model_root_path("/", "vlm")
if (not has_logits_processors_arg) and custom_logits_processors:
args.extend(["--logits-processors", "mineru_vl_utils:MinerULogitsProcessor"])
"""
# musa vllm v1 引擎特殊配置
device = get_device()
if device.startswith("musa"):
import torch
if torch.musa.is_available():
if not has_block_size_arg:
args.extend(["--block-size", "32"])
if not has_compilation_config:
args.extend(["--compilation-config", '{"cudagraph_capture_sizes": [1,2,3,4,5,6,7,8,10,12,14,16,18,20,24,28,30], "simple_cuda_graph": true}'])
"""
# 重构参数,将模型路径作为位置参数
sys.argv = [sys.argv[0]] + ["serve", model_path] + args

View File

@@ -186,6 +186,18 @@ def model_init(model_name: str):
bf_16_support = True
elif device_name.startswith("mps"):
bf_16_support = True
elif device_name.startswith("gcu"):
if hasattr(torch, 'gcu') and torch.gcu.is_available():
if torch.gcu.is_bf16_supported():
bf_16_support = True
elif device_name.startswith("musa"):
if hasattr(torch, 'musa') and torch.musa.is_available():
if torch.musa.is_bf16_supported():
bf_16_support = True
elif device_name.startswith("npu"):
if hasattr(torch, 'npu') and torch.npu.is_available():
if torch.npu.is_bf16_supported():
bf_16_support = True
if model_name == 'layoutreader':
# 检测modelscope的缓存目录是否存在

View File

@@ -86,7 +86,15 @@ def get_device():
if torch_npu.npu.is_available():
return "npu"
except Exception as e:
pass
try:
if torch.gcu.is_available():
return "gcu"
except Exception as e:
try:
if torch.musa.is_available():
return "musa"
except Exception as e:
pass
return "cpu"

View File

@@ -414,7 +414,7 @@ def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshol
def clean_memory(device='cuda'):
if device == 'cuda':
if str(device).startswith("cuda"):
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
@@ -423,6 +423,12 @@ def clean_memory(device='cuda'):
torch_npu.npu.empty_cache()
elif str(device).startswith("mps"):
torch.mps.empty_cache()
elif str(device).startswith("gcu"):
if torch.gcu.is_available():
torch.gcu.empty_cache()
elif str(device).startswith("musa"):
if torch.musa.is_available():
torch.musa.empty_cache()
gc.collect()
@@ -458,5 +464,11 @@ def get_vram(device) -> int:
elif str(device).startswith("npu"):
if torch_npu.npu.is_available():
total_memory = round(torch_npu.npu.get_device_properties(device).total_memory / (1024 ** 3)) # 转为 GB
elif str(device).startswith("gcu"):
if torch.gcu.is_available():
total_memory = round(torch.gcu.get_device_properties(device).total_memory / (1024 ** 3)) # 转为 GB
elif str(device).startswith("musa"):
if torch.musa.is_available():
total_memory = round(torch.musa.get_device_properties(device).total_memory / (1024 ** 3)) # 转为 GB
return total_memory

View File

@@ -232,13 +232,17 @@ def images_bytes_to_pdf_bytes(image_bytes):
# 载入并转换所有图像为 RGB 模式
image = Image.open(BytesIO(image_bytes))
# 根据 EXIF 信息自动转正(处理手机拍摄的带 Orientation 标记的图片)
ImageOps.exif_transpose(image, in_place=True)
image = ImageOps.exif_transpose(image) or image
# 只在必要时转换
if image.mode != "RGB":
image = image.convert("RGB")
# 第一张图保存为 PDF其余追加
image.save(pdf_buffer, format="PDF", save_all=True)
image.save(
pdf_buffer,
format="PDF",
# save_all=True
)
# 获取 PDF bytes 并重置指针(可选)
pdf_bytes = pdf_buffer.getvalue()

View File

@@ -9,13 +9,19 @@ from mineru.utils.char_utils import full_to_half
from mineru.utils.enum_class import BlockType, SplitFlag
CONTINUATION_MARKERS = [
CONTINUATION_END_MARKERS = [
"(续)",
"(续表)",
"(续上表)",
"(continued)",
"(cont.)",
"(contd)",
"(…continued)",
"续表",
]
CONTINUATION_INLINE_MARKERS = [
"(continued)",
]
@@ -64,6 +70,69 @@ def calculate_table_total_columns(soup):
return max_cols
def build_table_occupied_matrix(soup):
    """Map each table row to its effective column count.

    Walks every ``<tr>`` and marks the grid positions claimed by each
    ``<td>``/``<th>``, expanding ``colspan``/``rowspan``, so a row's width
    also reflects cells spilling down from earlier rows via rowspan.

    Args:
        soup: BeautifulSoup-parsed table.

    Returns:
        dict: {row_idx: effective_columns} — per-row effective column
            count with rowspan occupancy included; empty dict when the
            table has no rows.
    """
    all_rows = soup.find_all("tr")
    if not all_rows:
        return {}

    taken = {}          # {row_idx: {col_idx: True}} grid occupancy
    width_by_row = {}   # {row_idx: effective column count}

    for r_idx, tr in enumerate(all_rows):
        taken.setdefault(r_idx, {})
        cursor = 0
        for cell in tr.find_all(["td", "th"]):
            # Advance past columns already claimed by a rowspan from above.
            while cursor in taken[r_idx]:
                cursor += 1
            c_span = int(cell.get("colspan", 1))
            r_span = int(cell.get("rowspan", 1))
            # Claim the full colspan x rowspan rectangle for this cell.
            for rr in range(r_idx, r_idx + r_span):
                taken.setdefault(rr, {})
                for cc in range(cursor, cursor + c_span):
                    taken[rr][cc] = True
            cursor += c_span
        # Width = highest occupied column index + 1 (0 for an empty row).
        width_by_row[r_idx] = max(taken[r_idx]) + 1 if taken[r_idx] else 0

    return width_by_row
def calculate_row_effective_columns(soup, row_idx):
    """Return the effective column count of a single table row.

    The effective width accounts for cells spilling into this row via
    rowspan from rows above (delegates to build_table_occupied_matrix).

    Args:
        soup: BeautifulSoup-parsed table.
        row_idx: zero-based row index.

    Returns:
        int: effective column count; 0 when the row does not exist.
    """
    return build_table_occupied_matrix(soup).get(row_idx, 0)
def calculate_row_columns(row):
"""
计算表格行的实际列数考虑colspan属性
@@ -113,6 +182,10 @@ def detect_table_headers(soup1, soup2, max_header_rows=5):
rows1 = soup1.find_all("tr")
rows2 = soup2.find_all("tr")
# 构建两个表格的有效列数矩阵
effective_cols1 = build_table_occupied_matrix(soup1)
effective_cols2 = build_table_occupied_matrix(soup2)
min_rows = min(len(rows1), len(rows2), max_header_rows)
header_rows = 0
headers_match = True
@@ -130,20 +203,24 @@ def detect_table_headers(soup1, soup2, max_header_rows=5):
if len(cells1) != len(cells2):
structure_match = False
else:
# 然后检查单元格的属性和内容
for cell1, cell2 in zip(cells1, cells2):
colspan1 = int(cell1.get("colspan", 1))
rowspan1 = int(cell1.get("rowspan", 1))
colspan2 = int(cell2.get("colspan", 1))
rowspan2 = int(cell2.get("rowspan", 1))
# 检查有效列数是否一致考虑rowspan影响
if effective_cols1.get(i, 0) != effective_cols2.get(i, 0):
structure_match = False
else:
# 然后检查单元格的属性和内容
for cell1, cell2 in zip(cells1, cells2):
colspan1 = int(cell1.get("colspan", 1))
rowspan1 = int(cell1.get("rowspan", 1))
colspan2 = int(cell2.get("colspan", 1))
rowspan2 = int(cell2.get("rowspan", 1))
# 去除所有空白字符(包括空格、换行、制表符等)
text1 = ''.join(full_to_half(cell1.get_text()).split())
text2 = ''.join(full_to_half(cell2.get_text()).split())
# 去除所有空白字符(包括空格、换行、制表符等)
text1 = ''.join(full_to_half(cell1.get_text()).split())
text2 = ''.join(full_to_half(cell2.get_text()).split())
if colspan1 != colspan2 or rowspan1 != rowspan2 or text1 != text2:
structure_match = False
break
if colspan1 != colspan2 or rowspan1 != rowspan2 or text1 != text2:
structure_match = False
break
if structure_match:
header_rows += 1
@@ -153,7 +230,54 @@ def detect_table_headers(soup1, soup2, max_header_rows=5):
headers_match = header_rows > 0 # 只有当至少匹配了一行时,才认为表头匹配
break
# 如果没有找到匹配的表头行,则返回失败
# 如果严格匹配失败,尝试视觉一致性匹配(只比较文本内容)
if header_rows == 0:
header_rows, headers_match, header_texts = _detect_table_headers_visual(soup1, soup2, rows1, rows2, max_header_rows)
return header_rows, headers_match, header_texts
def _detect_table_headers_visual(soup1, soup2, rows1, rows2, max_header_rows=5):
"""
基于视觉一致性检测表头只比较文本内容忽略colspan/rowspan差异
Args:
soup1: 第一个表格的BeautifulSoup对象
soup2: 第二个表格的BeautifulSoup对象
rows1: 第一个表格的行列表
rows2: 第二个表格的行列表
max_header_rows: 最大可能的表头行数
Returns:
tuple: (表头行数, 表头是否一致, 表头文本列表)
"""
# 构建两个表格的有效列数矩阵
effective_cols1 = build_table_occupied_matrix(soup1)
effective_cols2 = build_table_occupied_matrix(soup2)
min_rows = min(len(rows1), len(rows2), max_header_rows)
header_rows = 0
headers_match = True
header_texts = []
for i in range(min_rows):
cells1 = rows1[i].find_all(["td", "th"])
cells2 = rows2[i].find_all(["td", "th"])
# 提取每行的文本内容列表(去除空白字符)
texts1 = [''.join(full_to_half(cell.get_text()).split()) for cell in cells1]
texts2 = [''.join(full_to_half(cell.get_text()).split()) for cell in cells2]
# 检查视觉一致性:文本内容完全相同,且有效列数一致
effective_cols_match = effective_cols1.get(i, 0) == effective_cols2.get(i, 0)
if texts1 == texts2 and effective_cols_match:
header_rows += 1
row_texts = [full_to_half(cell.get_text().strip()) for cell in cells1]
header_texts.append(row_texts)
else:
headers_match = header_rows > 0
break
if header_rows == 0:
headers_match = False
@@ -163,20 +287,32 @@ def detect_table_headers(soup1, soup2, max_header_rows=5):
def can_merge_tables(current_table_block, previous_table_block):
"""判断两个表格是否可以合并"""
# 检查表格是否有caption和footnote
# 计算previous_table_block中的footnote数量
footnote_count = sum(1 for block in previous_table_block["blocks"] if block["type"] == BlockType.TABLE_FOOTNOTE)
# 如果有TABLE_CAPTION类型的块,检查是否至少有一个以"(续)"结尾
caption_blocks = [block for block in current_table_block["blocks"] if block["type"] == BlockType.TABLE_CAPTION]
if caption_blocks:
# 如果所有caption都不以"(续)"、"(续表)"、"(continued)"或"(cont.)"结尾,则不合并
# 检查是否至少有一个caption包含续表标识
has_continuation_marker = False
for block in caption_blocks:
caption_text = full_to_half(merge_para_with_text(block).strip()).lower()
if (
any(caption_text.endswith(marker.lower()) for marker in CONTINUATION_END_MARKERS)
or any(marker.lower() in caption_text for marker in CONTINUATION_INLINE_MARKERS)
):
has_continuation_marker = True
break
if not any(
any(full_to_half(merge_para_with_text(block).strip()).lower().endswith(marker.lower())
for marker in CONTINUATION_MARKERS)
for block in caption_blocks
):
# 如果所有caption都不包含续表标识则不允许合并
if not has_continuation_marker:
return False, None, None, None, None
if any(block["type"] == BlockType.TABLE_FOOTNOTE for block in previous_table_block["blocks"]):
return False, None, None, None, None
# 如果current_table_block的caption存在续标识,放宽footnote的限制允许previous_table_block有最多一条footnote
if footnote_count > 1:
return False, None, None, None, None
else:
if footnote_count > 0:
return False, None, None, None, None
# 获取两个表格的HTML内容
current_html = ""
@@ -226,34 +362,44 @@ def check_rows_match(soup1, soup2):
if not (rows1 and rows2):
return False
# 获取第一个表的最后一行数据行
# 获取第一个表的最后一行数据行索引
last_row_idx = None
last_row = None
for row in reversed(rows1):
if row.find_all(["td", "th"]):
last_row = row
for idx in range(len(rows1) - 1, -1, -1):
if rows1[idx].find_all(["td", "th"]):
last_row_idx = idx
last_row = rows1[idx]
break
# 检测表头行数,以便获取第二个表的首个数据行
header_count, _, _ = detect_table_headers(soup1, soup2)
# 获取第二个表的首个数据行
first_data_row_idx = None
first_data_row = None
if len(rows2) > header_count:
first_data_row_idx = header_count
first_data_row = rows2[header_count] # 第一个非表头行
if not (last_row and first_data_row):
return False
# 计算实际列数考虑colspan和视觉列数
# 计算有效列数(考虑rowspan和colspan
last_row_effective_cols = calculate_row_effective_columns(soup1, last_row_idx)
first_row_effective_cols = calculate_row_effective_columns(soup2, first_data_row_idx)
# 计算实际列数仅考虑colspan和视觉列数
last_row_cols = calculate_row_columns(last_row)
first_row_cols = calculate_row_columns(first_data_row)
last_row_visual_cols = calculate_visual_columns(last_row)
first_row_visual_cols = calculate_visual_columns(first_data_row)
# logger.debug(f"行列数 - 前表最后一行: {last_row_cols}(视觉列数:{last_row_visual_cols}), 当前表首行: {first_row_cols}(视觉列数:{first_row_visual_cols})")
# logger.debug(f"行列数 - 前表最后一行: {last_row_cols}(有效列数:{last_row_effective_cols}, 视觉列数:{last_row_visual_cols}), 当前表首行: {first_row_cols}(有效列数:{first_row_effective_cols}, 视觉列数:{first_row_visual_cols})")
# 同时考虑实际列数匹配和视觉列数匹配
return last_row_cols == first_row_cols or last_row_visual_cols == first_row_visual_cols
# 同时考虑有效列数匹配、实际列数匹配和视觉列数匹配
return (last_row_effective_cols == first_row_effective_cols or
last_row_cols == first_row_cols or
last_row_visual_cols == first_row_visual_cols)
def check_row_columns_match(row1, row2):
@@ -270,12 +416,13 @@ def check_row_columns_match(row1, row2):
return True
def adjust_table_rows_colspan(rows, start_idx, end_idx,
def adjust_table_rows_colspan(soup, rows, start_idx, end_idx,
reference_structure, reference_visual_cols,
target_cols, current_cols, reference_row):
"""调整表格行的colspan属性以匹配目标列数
Args:
soup: BeautifulSoup解析的表格对象用于计算有效列数
rows: 表格行列表
start_idx: 起始行索引
end_idx: 结束行索引(不包含)
@@ -287,14 +434,21 @@ def adjust_table_rows_colspan(rows, start_idx, end_idx,
"""
reference_row_copy = deepcopy(reference_row)
# 构建有效列数矩阵
effective_cols_matrix = build_table_occupied_matrix(soup)
for i in range(start_idx, end_idx):
row = rows[i]
cells = row.find_all(["td", "th"])
if not cells:
continue
# 使用有效列数考虑rowspan判断是否需要调整
current_row_effective_cols = effective_cols_matrix.get(i, 0)
current_row_cols = calculate_row_columns(row)
if current_row_cols >= target_cols:
# 如果有效列数或实际列数已经达到目标,则跳过
if current_row_effective_cols >= target_cols or current_row_cols >= target_cols:
continue
# 检查是否与参考行结构匹配
@@ -306,9 +460,12 @@ def adjust_table_rows_colspan(rows, start_idx, end_idx,
cell["colspan"] = str(reference_structure[j])
else:
# 扩展最后一个单元格以填补列数差异
last_cell = cells[-1]
current_last_span = int(last_cell.get("colspan", 1))
last_cell["colspan"] = str(current_last_span + (target_cols - current_cols))
# 使用有效列数来计算差异
cols_diff = target_cols - current_row_effective_cols
if cols_diff > 0:
last_cell = cells[-1]
current_last_span = int(last_cell.get("colspan", 1))
last_cell["colspan"] = str(current_last_span + cols_diff)
def perform_table_merge(soup1, soup2, previous_table_block, wait_merge_table_footnotes):
@@ -339,7 +496,7 @@ def perform_table_merge(soup1, soup2, previous_table_block, wait_merge_table_foo
reference_visual_cols = calculate_visual_columns(last_row1)
# 以表1的最后一行为参考调整表2的行
adjust_table_rows_colspan(
rows2, header_count, len(rows2),
soup2, rows2, header_count, len(rows2),
reference_structure, reference_visual_cols,
table_cols1, table_cols2, first_data_row2
)
@@ -349,7 +506,7 @@ def perform_table_merge(soup1, soup2, previous_table_block, wait_merge_table_foo
reference_visual_cols = calculate_visual_columns(first_data_row2)
# 以表2的第一个数据行为参考调整表1的行
adjust_table_rows_colspan(
rows1, 0, len(rows1),
soup1, rows1, 0, len(rows1),
reference_structure, reference_visual_cols,
table_cols2, table_cols1, last_row1
)
@@ -363,6 +520,11 @@ def perform_table_merge(soup1, soup2, previous_table_block, wait_merge_table_foo
row.extract()
tbody1.append(row)
# 清空previous_table_block的footnote
previous_table_block["blocks"] = [
block for block in previous_table_block["blocks"]
if block["type"] != BlockType.TABLE_FOOTNOTE
]
# 添加待合并表格的footnote到前一个表格中
for table_footnote in wait_merge_table_footnotes:
temp_table_footnote = table_footnote.copy()
@@ -423,4 +585,4 @@ def merge_table(page_info_list):
# 删除当前页的table
for block in current_table_block["blocks"]:
block['lines'] = []
block[SplitFlag.LINES_DELETED] = True
block[SplitFlag.LINES_DELETED] = True

View File

@@ -1 +1 @@
__version__ = "2.7.0"
__version__ = "2.7.1"

View File

@@ -1,4 +1,9 @@
# Welcome to the MinerU Project List
# Welcome to the MinerU Project List (Archived)
>[!NOTE]
> All projects in this repository are contributed by community developers. The official team does not provide maintenance or technical support. To consolidate resources, this repository has stopped accepting new project submissions and maintenance requests for existing projects.
> If you have an excellent project based on MinerU that you'd like to share, please submit the project link to the well-maintained community resource repository [awesome-mineru](https://github.com/opendatalab/awesome-mineru).
> Thank you for your support and contribution to the MinerU ecosystem!
## Project List

View File

@@ -1,4 +1,10 @@
# 欢迎来到 MinerU 项目列表
# 欢迎来到 MinerU 项目列表Archived
>[!NOTE]
> 本仓库中所有项目均由社区开发者贡献,官方团队不提供维护与技术支持。
> 为整合资源,本仓库目前已停止接收新项目提交和已有项目的维护请求。
> 如果您有基于 MinerU 的优秀项目希望分享,欢迎将项目链接提交至精心维护的社区资源库 [awesome-mineru](https://github.com/opendatalab/awesome-mineru)。
> 感谢您对 MinerU 生态的支持与贡献!
## 项目列表

View File

@@ -21,7 +21,7 @@ dependencies = [
"click>=8.1.7",
"loguru>=0.7.2",
"numpy>=1.21.6",
"pdfminer.six==20250506",
"pdfminer.six>=20251230",
"tqdm>=4.67.1",
"requests",
"httpx",
@@ -70,7 +70,7 @@ pipeline = [
"ultralytics>=8.3.48,<9",
"doclayout_yolo==0.0.4",
"dill>=0.3.8,<1",
"PyYAML>=6.0.2,<7",
"PyYAML>=6.0.1,<7",
"ftfy>=6.3.1,<7",
"shapely>=2.0.7,<3",
"pyclipper>=1.3.0,<2",
@@ -94,10 +94,10 @@ core = [
"mineru[pipeline]",
"mineru[api]",
"mineru[gradio]",
"mineru[mlx] ; sys_platform == 'darwin'",
]
all = [
"mineru[core]",
"mineru[mlx] ; sys_platform == 'darwin'",
"mineru[vllm] ; sys_platform == 'linux'",
"mineru[lmdeploy] ; sys_platform == 'windows'",
]