Compare commits

...

21 Commits

Author SHA1 Message Date
Xiaomeng Zhao
d77d5edddb Merge pull request #2644 from myhloli/dev
fix: update configure_model to use environment variable for config file name
2025-06-14 22:22:20 +08:00
myhloli
a812ae899e Merge remote-tracking branch 'origin/dev' into dev 2025-06-14 22:20:18 +08:00
myhloli
1793bdfc7d fix: update configure_model to use environment variable for config file name 2025-06-14 22:20:07 +08:00
Xiaomeng Zhao
6e54a68cef Merge pull request #2638 from myhloli/dev
fix: add pdftext link to README and README_zh-CN for completeness
2025-06-14 10:27:55 +08:00
myhloli
6e39928204 fix: add pdftext link to README and README_zh-CN for completeness 2025-06-14 10:26:49 +08:00
Xiaomeng Zhao
52537958ec Merge pull request #2635 from myhloli/dev
fix: update Table of Contents in README and README_zh-CN for clarity and consistency
2025-06-13 22:17:26 +08:00
myhloli
d8989ed116 fix: update Table of Contents in README and README_zh-CN for clarity and consistency 2025-06-13 22:15:53 +08:00
Xiaomeng Zhao
9669111faf Merge pull request #2633 from myhloli/dev
fix: revert json_url in configure_model to use original MinerU template
2025-06-13 21:09:23 +08:00
myhloli
fdca2c8ef0 Merge remote-tracking branch 'origin/dev' into dev 2025-06-13 21:08:20 +08:00
myhloli
91208fb1bd fix: revert json_url in configure_model to use original MinerU template 2025-06-13 21:08:11 +08:00
Xiaomeng Zhao
3376f3a7d9 Merge pull request #2632 from opendatalab/master
master->dev
2025-06-13 21:05:32 +08:00
Xiaomeng Zhao
c5480b9d39 Merge pull request #2631 from opendatalab/release-2.0.0
Release 2.0.0
2025-06-13 20:38:26 +08:00
myhloli
97c1362e3c Update version.py with new version 2025-06-13 12:31:41 +00:00
Xiaomeng Zhao
28588d7c65 Merge pull request #2628 from opendatalab/release-2.0.0
Release 2.0.0
2025-06-13 20:29:04 +08:00
Xiaomeng Zhao
6ab123487b Merge pull request #2625 from opendatalab/release-2.0.0
Release 2.0.0
2025-06-13 20:21:52 +08:00
github-actions[bot]
9487d33d7b @YanzhenHuang has signed the CLA in opendatalab/MinerU#2620 2025-06-13 04:22:59 +00:00
github-actions[bot]
46f7e0f532 @AdrianWangs has signed the CLA in opendatalab/MinerU#2578 2025-06-05 11:30:53 +00:00
github-actions[bot]
efba5d4594 @PairZhu has signed the CLA in opendatalab/MinerU#2566 2025-06-04 02:39:52 +00:00
github-actions[bot]
a911c29fbb @liuzhenghua has signed the CLA in opendatalab/MinerU#2550 2025-05-30 02:57:16 +00:00
github-actions[bot]
0ac5623ad6 @seedclaimer has signed the CLA in opendatalab/MinerU#2536 2025-05-28 12:50:37 +00:00
Xiaomeng Zhao
113a3ad91f Create SECURITY.md 2025-05-28 11:22:55 +08:00
6 changed files with 141 additions and 90 deletions

View File

@@ -347,48 +347,38 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
<details>
<summary>2024/07/05 Initial open-source release</summary>
</details>
</details>
<!-- TABLE OF CONTENT -->
<!-- TABLE OF CONTENT -->
<details open="open">
<summary><h2 style="display: inline-block">Table of Contents</h2></summary>
<ol>
<li>
<a href="#mineru">MinerU</a>
<ul>
<li><a href="#project-introduction">Project Introduction</a></li>
<li><a href="#key-features">Key Features</a></li>
<li><a href="#quick-start">Quick Start</a>
<ul>
<li><a href="#online-demo">Online Demo</a></li>
<li><a href="#quick-cpu-demo">Quick CPU Demo</a></li>
<li><a href="#using-gpu">Using GPU</a></li>
<li><a href="#using-npu">Using NPU</a></li>
</ul>
</li>
<li><a href="#usage">Usage</a>
<ul>
<li><a href="#command-line">Command Line</a></li>
<li><a href="#api">API</a></li>
<li><a href="#deploy-derived-projects">Deploy Derived Projects</a></li>
<li><a href="#development-guide">Development Guide</a></li>
</ul>
</li>
</ul>
</li>
<li><a href="#todo">TODO</a></li>
<li><a href="#known-issues">Known Issues</a></li>
<li><a href="#faq">FAQ</a></li>
<li><a href="#all-thanks-to-our-contributors">All Thanks To Our Contributors</a></li>
<li><a href="#license-information">License Information</a></li>
<li><a href="#acknowledgments">Acknowledgments</a></li>
<li><a href="#citation">Citation</a></li>
<li><a href="#star-history">Star History</a></li>
<li><a href="#magic-doc">Magic-doc</a></li>
<li><a href="#magic-html">Magic-html</a></li>
<li><a href="#links">Links</a></li>
</ol>
</details>
<details open="open">
<summary><h2 style="display: inline-block">Table of Contents</h2></summary>
<ol>
<li>
<a href="#mineru">MinerU</a>
<ul>
<li><a href="#project-introduction">Project Introduction</a></li>
<li><a href="#key-features">Key Features</a></li>
<li><a href="#quick-start">Quick Start</a>
<ul>
<li><a href="#online-demo">Online Demo</a></li>
<li><a href="#local-deployment">Local Deployment</a></li>
</ul>
</li>
</ul>
</li>
<li><a href="#todo">TODO</a></li>
<li><a href="#known-issues">Known Issues</a></li>
<li><a href="#faq">FAQ</a></li>
<li><a href="#all-thanks-to-our-contributors">All Thanks To Our Contributors</a></li>
<li><a href="#license-information">License Information</a></li>
<li><a href="#acknowledgments">Acknowledgments</a></li>
<li><a href="#citation">Citation</a></li>
<li><a href="#star-history">Star History</a></li>
<li><a href="#magic-doc">Magic-doc</a></li>
<li><a href="#magic-html">Magic-html</a></li>
<li><a href="#links">Links</a></li>
</ol>
</details>
# MinerU
@@ -717,6 +707,7 @@ Currently, some models in this project are trained based on YOLO. However, since
- [xy-cut](https://github.com/Sanster/xy-cut)
- [fast-langdetect](https://github.com/LlmKira/fast-langdetect)
- [pypdfium2](https://github.com/pypdfium2-team/pypdfium2)
- [pdftext](https://github.com/datalab-to/pdftext)
- [pdfminer.six](https://github.com/pdfminer/pdfminer.six)
- [pypdf](https://github.com/py-pdf/pypdf)

View File

@@ -336,49 +336,38 @@
<details>
<summary>2024/07/05 首次开源</summary>
</details>
<!-- TABLE OF CONTENT -->
<details open="open">
<summary><h2 style="display: inline-block">文档目录</h2></summary>
<ol>
<li>
<a href="#mineru">MinerU</a>
<ul>
<li><a href="#项目简介">项目简介</a></li>
<li><a href="#主要功能">主要功能</a></li>
<li><a href="#快速开始">快速开始</a>
<ul>
<li><a href="#在线体验">在线体验</a></li>
<li><a href="#使用CPU快速体验">使用CPU快速体验</a></li>
<li><a href="#使用GPU">使用GPU</a></li>
<li><a href="#使用NPU">使用NPU</a></li>
</ul>
</li>
<li><a href="#使用">使用方式</a>
<ul>
<li><a href="#命令行">命令行</a></li>
<li><a href="#api">API</a></li>
<li><a href="#部署衍生项目">部署衍生项目</a></li>
<li><a href="#二次开发">二次开发</a></li>
</ul>
</li>
</ul>
</li>
<li><a href="#todo">TODO</a></li>
<li><a href="#known-issues">Known Issues</a></li>
<li><a href="#faq">FAQ</a></li>
<li><a href="#all-thanks-to-our-contributors">Contributors</a></li>
<li><a href="#license-information">License Information</a></li>
<li><a href="#acknowledgments">Acknowledgements</a></li>
<li><a href="#citation">Citation</a></li>
<li><a href="#star-history">Star History</a></li>
<li><a href="#magic-doc">magic-doc快速提取PPT/DOC/PDF</a></li>
<li><a href="#magic-html">magic-html提取混合网页内容</a></li>
<li><a href="#links">Links</a></li>
</ol>
</details>
</details>
<!-- TABLE OF CONTENT -->
<details open="open">
<summary><h2 style="display: inline-block">文档目录</h2></summary>
<ol>
<li>
<a href="#mineru">MinerU</a>
<ul>
<li><a href="#项目简介">项目简介</a></li>
<li><a href="#主要功能">主要功能</a></li>
<li><a href="#快速开始">快速开始</a>
<ul>
<li><a href="#在线体验">在线体验</a></li>
<li><a href="#本地部署">本地部署</a></li>
</ul>
</li>
</ul>
</li>
<li><a href="#todo">TODO</a></li>
<li><a href="#known-issues">Known Issues</a></li>
<li><a href="#faq">FAQ</a></li>
<li><a href="#all-thanks-to-our-contributors">Contributors</a></li>
<li><a href="#license-information">License Information</a></li>
<li><a href="#acknowledgments">Acknowledgements</a></li>
<li><a href="#citation">Citation</a></li>
<li><a href="#star-history">Star History</a></li>
<li><a href="#magic-doc">magic-doc快速提取PPT/DOC/PDF</a></li>
<li><a href="#magic-html">magic-html提取混合网页内容</a></li>
<li><a href="#links">Links</a></li>
</ol>
</details>
@@ -710,6 +699,7 @@ mineru -p <input_path> -o <output_path> -b vlm-sglang-client -u http://127.0.0.1
- [xy-cut](https://github.com/Sanster/xy-cut)
- [fast-langdetect](https://github.com/LlmKira/fast-langdetect)
- [pypdfium2](https://github.com/pypdfium2-team/pypdfium2)
- [pdftext](https://github.com/datalab-to/pdftext)
- [pdfminer.six](https://github.com/pdfminer/pdfminer.six)
- [pypdf](https://github.com/py-pdf/pypdf)

31
SECURITY.md Normal file
View File

@@ -0,0 +1,31 @@
# Security Policy
## Supported Versions
Only the latest released version is supported.
## Reporting a Vulnerability
Please do not report security vulnerabilities through public GitHub issues.
Instead, please report them at https://github.com/opendatalab/MinerU/security.
Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
* Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
* Full paths of source file(s) related to the manifestation of the issue
* The location of the affected source code (tag/branch/commit or direct URL)
* Any special configuration required to reproduce the issue
* Step-by-step instructions to reproduce the issue
* Proof-of-concept or exploit code (if possible)
* Impact of the issue, including how an attacker might exploit the issue
This information will help us triage your report more quickly.
## Preferred Languages
We prefer all communications to be in English or Chinese.
## Policy
We will fix security issues in the project's own code as quickly as possible. Until a fix has been released, please do not disclose the vulnerability on any public platform.

View File

@@ -42,9 +42,8 @@ def download_and_modify_json(url, local_filename, modifications):
def configure_model(model_dir, model_type):
"""配置模型"""
# json_url = 'https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/mineru.template.json'
json_url = 'https://gcore.jsdelivr.net/gh/myhloli/Magic-PDF@dev/mineru.template.json'
config_file_name = 'mineru.json'
json_url = 'https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/mineru.template.json'
config_file_name = os.getenv('MINERU_TOOLS_CONFIG_JSON', 'mineru.json')
home_dir = os.path.expanduser('~')
config_file = os.path.join(home_dir, config_file_name)
@@ -120,13 +119,13 @@ def download_models(model_source, model_type):
click.echo(f"Downloading model: {model_path}")
download_finish_path = auto_download_and_get_model_root_path(model_path, repo_mode='pipeline')
click.echo(f"Pipeline models downloaded successfully to: {download_finish_path}")
configure_model(download_finish_path, model_type)
configure_model(download_finish_path, "pipeline")
def download_vlm_models():
"""下载VLM模型"""
download_finish_path = auto_download_and_get_model_root_path("/", repo_mode='vlm')
click.echo(f"VLM models downloaded successfully to: {download_finish_path}")
configure_model(download_finish_path, model_type)
configure_model(download_finish_path, "vlm")
try:
if model_type == 'pipeline':

View File

@@ -1 +1 @@
__version__ = "2.0.0"
__version__ = "2.0.0"

View File

@@ -263,6 +263,46 @@
"created_at": "2025-04-30T09:25:31Z",
"repoId": 765083837,
"pullRequestNo": 2411
},
{
"name": "seedclaimer",
"id": 86753366,
"comment_id": 2916194375,
"created_at": "2025-05-28T12:50:25Z",
"repoId": 765083837,
"pullRequestNo": 2536
},
{
"name": "liuzhenghua",
"id": 11787325,
"comment_id": 2921092605,
"created_at": "2025-05-30T02:57:07Z",
"repoId": 765083837,
"pullRequestNo": 2550
},
{
"name": "PairZhu",
"id": 47098840,
"comment_id": 2938149702,
"created_at": "2025-06-04T02:39:39Z",
"repoId": 765083837,
"pullRequestNo": 2566
},
{
"name": "AdrianWangs",
"id": 72337244,
"comment_id": 2943818300,
"created_at": "2025-06-05T11:30:42Z",
"repoId": 765083837,
"pullRequestNo": 2578
},
{
"name": "YanzhenHuang",
"id": 86364920,
"comment_id": 2968974232,
"created_at": "2025-06-13T04:17:08Z",
"repoId": 765083837,
"pullRequestNo": 2620
}
]
}