mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
Merge branch 'master' into release-0.9.0
This commit is contained in:
@@ -44,7 +44,6 @@ RUN /bin/bash -c "wget https://gitee.com/myhloli/MinerU/raw/master/magic-pdf.tem
|
||||
RUN /bin/bash -c "pip3 install modelscope && \
|
||||
wget https://gitee.com/myhloli/MinerU/raw/master/docs/download_models.py && \
|
||||
python3 download_models.py && \
|
||||
sed -i 's|/tmp/models|/root/.cache/modelscope/hub/opendatalab/PDF-Extract-Kit/models|g' /root/magic-pdf.json && \
|
||||
sed -i 's|cpu|cuda|g' /root/magic-pdf.json"
|
||||
|
||||
# Set the entry point to activate the virtual environment and run the command line tool
|
||||
|
||||
@@ -18,7 +18,9 @@
|
||||
[](https://huggingface.co/spaces/opendatalab/MinerU)
|
||||
[](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
|
||||
[](https://colab.research.google.com/gist/papayalove/b5f4913389e7ff9883c6b687de156e78/mineru_demo.ipynb)
|
||||
[](https://arxiv.org/pdf/2409.18839?)
|
||||
|
||||
[](https://arxiv.org/abs/2409.18839)
|
||||
|
||||
|
||||
<a href="https://trendshift.io/repositories/11174" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11174" alt="opendatalab%2FMinerU | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
||||
|
||||
@@ -226,6 +228,7 @@ You can find the `magic-pdf.json` file in your 【user directory】.
|
||||
|
||||
You can modify certain configurations in this file to enable or disable features, such as table recognition:
|
||||
|
||||
|
||||
> If the following items are not present in the JSON, please manually add the required items and remove the comment content (standard JSON does not support comments).
|
||||
|
||||
```json
|
||||
|
||||
@@ -18,7 +18,8 @@
|
||||
[](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
|
||||
[](https://huggingface.co/spaces/opendatalab/MinerU)
|
||||
[](https://colab.research.google.com/gist/papayalove/b5f4913389e7ff9883c6b687de156e78/mineru_demo.ipynb)
|
||||
[](https://arxiv.org/pdf/2409.18839?)
|
||||
[](https://arxiv.org/abs/2409.18839)
|
||||
|
||||
|
||||
<a href="https://trendshift.io/repositories/11174" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11174" alt="opendatalab%2FMinerU | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
||||
|
||||
|
||||
@@ -75,8 +75,10 @@ If the version number is less than 0.7.0, please report the issue.
|
||||
|
||||
### 6. Download Models
|
||||
|
||||
|
||||
Refer to the detailed instructions on [how to download model files](how_to_download_models_en.md).
|
||||
|
||||
|
||||
### 7. Understand the Location of the Configuration File
|
||||
|
||||
After completing the [6. Download Models](#6-download-models) step, the script will automatically generate a `magic-pdf.json` file in the user directory and configure the default model path.
|
||||
@@ -84,6 +86,7 @@ You can find the `magic-pdf.json` file in your user directory.
|
||||
|
||||
> The user directory for Linux is "/home/username".
|
||||
|
||||
|
||||
### 8. First Run
|
||||
|
||||
Download a sample file from the repository and test it.
|
||||
|
||||
@@ -76,6 +76,7 @@ pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i h
|
||||
|
||||
## 6. 下载模型
|
||||
|
||||
|
||||
详细参考 [如何下载模型文件](how_to_download_models_zh_cn.md)
|
||||
|
||||
## 7. 了解配置文件存放的位置
|
||||
@@ -83,6 +84,7 @@ pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i h
|
||||
完成[6.下载模型](#6-下载模型)步骤后,脚本会自动生成用户目录下的magic-pdf.json文件,并自动配置默认模型路径。
|
||||
您可在【用户目录】下找到magic-pdf.json文件。
|
||||
|
||||
|
||||
> Linux用户目录为 "/home/用户名"
|
||||
|
||||
## 8. 第一次运行
|
||||
|
||||
@@ -82,6 +82,7 @@ If your graphics card has at least 8GB of VRAM, follow these steps to test CUDA-
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
3. **Run the following command to test CUDA acceleration**:
|
||||
|
||||
```
|
||||
|
||||
@@ -46,6 +46,7 @@ pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i h
|
||||
完成[5.下载模型](#5-下载模型)步骤后,脚本会自动生成用户目录下的magic-pdf.json文件,并自动配置默认模型路径。
|
||||
您可在【用户目录】下找到magic-pdf.json文件。
|
||||
|
||||
|
||||
> Windows用户目录为 "C:/Users/用户名"
|
||||
|
||||
## 7. 第一次运行
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
|
||||
import json
|
||||
import os
|
||||
|
||||
@@ -21,6 +22,7 @@ def download_and_modify_json(url, local_filename, modifications):
|
||||
else:
|
||||
data = download_json(url)
|
||||
|
||||
|
||||
# 修改内容
|
||||
for key, value in modifications.items():
|
||||
data[key] = value
|
||||
@@ -31,6 +33,7 @@ def download_and_modify_json(url, local_filename, modifications):
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
mineru_patterns = [
|
||||
"models/Layout/LayoutLMv3/*",
|
||||
"models/Layout/YOLO/*",
|
||||
@@ -48,6 +51,7 @@ if __name__ == '__main__':
|
||||
json_url = 'https://gitee.com/myhloli/MinerU/raw/dev/magic-pdf.template.json'
|
||||
config_file_name = 'magic-pdf.json'
|
||||
home_dir = os.path.expanduser('~')
|
||||
|
||||
config_file = os.path.join(home_dir, config_file_name)
|
||||
|
||||
json_mods = {
|
||||
@@ -56,4 +60,6 @@ if __name__ == '__main__':
|
||||
}
|
||||
|
||||
download_and_modify_json(json_url, config_file, json_mods)
|
||||
|
||||
print(f'The configuration file has been configured successfully, the path is: {config_file}')
|
||||
|
||||
|
||||
@@ -21,6 +21,7 @@ def download_and_modify_json(url, local_filename, modifications):
|
||||
else:
|
||||
data = download_json(url)
|
||||
|
||||
|
||||
# 修改内容
|
||||
for key, value in modifications.items():
|
||||
data[key] = value
|
||||
@@ -55,6 +56,7 @@ if __name__ == '__main__':
|
||||
json_url = 'https://github.com/opendatalab/MinerU/raw/dev/magic-pdf.template.json'
|
||||
config_file_name = 'magic-pdf.json'
|
||||
home_dir = os.path.expanduser('~')
|
||||
|
||||
config_file = os.path.join(home_dir, config_file_name)
|
||||
|
||||
json_mods = {
|
||||
@@ -63,4 +65,6 @@ if __name__ == '__main__':
|
||||
}
|
||||
|
||||
download_and_modify_json(json_url, config_file, json_mods)
|
||||
|
||||
print(f'The configuration file has been configured successfully, the path is: {config_file}')
|
||||
|
||||
|
||||
@@ -1,21 +1,21 @@
|
||||
Model downloads are divided into initial downloads and updates to the model directory. Please refer to the corresponding documentation for instructions on how to proceed.
|
||||
|
||||
|
||||
# Initial download of model files
|
||||
|
||||
### 1. Download the Model from Hugging Face
|
||||
### Download the Model from Hugging Face
|
||||
|
||||
Use a Python script to download the model files from Hugging Face:
|
||||
|
||||
```bash
|
||||
pip install huggingface_hub
|
||||
wget https://github.com/opendatalab/MinerU/raw/master/docs/download_models_hf.py -O download_models_hf.py
|
||||
python download_models_hf.py
|
||||
```
|
||||
|
||||
The Python script will automatically download the model files and configure the model directory in the configuration file.
|
||||
|
||||
The configuration file can be found in the user directory, with the filename `magic-pdf.json`.
|
||||
|
||||
|
||||
# How to update models previously downloaded
|
||||
|
||||
## 1. Models downloaded via Git LFS
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
<pre><code>pip install huggingface_hub
|
||||
wget https://gitee.com/myhloli/MinerU/raw/master/docs/download_models_hf.py -O download_models_hf.py
|
||||
python download_models_hf.py</code></pre>
|
||||
<p>python脚本会自动下载模型文件并配置好配置文件中的模型目录</p>
|
||||
</details>
|
||||
|
||||
## 方法二:从 ModelScope 下载模型
|
||||
@@ -21,13 +22,13 @@ pip install modelscope
|
||||
wget https://gitee.com/myhloli/MinerU/raw/master/docs/download_models.py -O download_models.py
|
||||
python download_models.py
|
||||
```
|
||||
|
||||
python脚本会自动下载模型文件并配置好配置文件中的模型目录
|
||||
|
||||
配置文件可以在用户目录中找到,文件名为`magic-pdf.json`
|
||||
|
||||
> Windows的用户目录为 "C:\\Users\\用户名", Linux用户目录为 "/home/用户名", macOS用户目录为 "/Users/用户名"
|
||||
|
||||
|
||||
# 此前下载过模型,如何更新
|
||||
|
||||
## 1. 通过git lfs下载过模型
|
||||
|
||||
@@ -47,6 +47,30 @@
|
||||
"created_at": "2024-08-26T07:01:49Z",
|
||||
"repoId": 765083837,
|
||||
"pullRequestNo": 487
|
||||
},
|
||||
{
|
||||
"name": "hamirmahal",
|
||||
"id": 43425812,
|
||||
"comment_id": 2395141155,
|
||||
"created_at": "2024-10-05T18:22:47Z",
|
||||
"repoId": 765083837,
|
||||
"pullRequestNo": 687
|
||||
},
|
||||
{
|
||||
"name": "wmpscc",
|
||||
"id": 29891793,
|
||||
"comment_id": 2416780426,
|
||||
"created_at": "2024-10-16T13:02:13Z",
|
||||
"repoId": 765083837,
|
||||
"pullRequestNo": 682
|
||||
},
|
||||
{
|
||||
"name": "randydl",
|
||||
"id": 36127931,
|
||||
"comment_id": 2439668779,
|
||||
"created_at": "2024-10-26T17:39:26Z",
|
||||
"repoId": 765083837,
|
||||
"pullRequestNo": 793
|
||||
}
|
||||
]
|
||||
}
|
||||
Reference in New Issue
Block a user