mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 19:18:34 +07:00
Compare commits
21 Commits
release-2.
...
release-2.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d77d5edddb | ||
|
|
a812ae899e | ||
|
|
1793bdfc7d | ||
|
|
6e54a68cef | ||
|
|
6e39928204 | ||
|
|
52537958ec | ||
|
|
d8989ed116 | ||
|
|
9669111faf | ||
|
|
fdca2c8ef0 | ||
|
|
91208fb1bd | ||
|
|
3376f3a7d9 | ||
|
|
c5480b9d39 | ||
|
|
97c1362e3c | ||
|
|
28588d7c65 | ||
|
|
6ab123487b | ||
|
|
9487d33d7b | ||
|
|
46f7e0f532 | ||
|
|
efba5d4594 | ||
|
|
a911c29fbb | ||
|
|
0ac5623ad6 | ||
|
|
113a3ad91f |
73
README.md
73
README.md
@@ -347,48 +347,38 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
|
||||
<details>
|
||||
<summary>2024/07/05 Initial open-source release</summary>
|
||||
</details>
|
||||
</details>
|
||||
|
||||
<!-- TABLE OF CONTENT -->
|
||||
|
||||
<!-- TABLE OF CONTENT -->
|
||||
|
||||
<details open="open">
|
||||
<summary><h2 style="display: inline-block">Table of Contents</h2></summary>
|
||||
<ol>
|
||||
<li>
|
||||
<a href="#mineru">MinerU</a>
|
||||
<ul>
|
||||
<li><a href="#project-introduction">Project Introduction</a></li>
|
||||
<li><a href="#key-features">Key Features</a></li>
|
||||
<li><a href="#quick-start">Quick Start</a>
|
||||
<ul>
|
||||
<li><a href="#online-demo">Online Demo</a></li>
|
||||
<li><a href="#quick-cpu-demo">Quick CPU Demo</a></li>
|
||||
<li><a href="#using-gpu">Using GPU</a></li>
|
||||
<li><a href="#using-npu">Using NPU</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
<li><a href="#usage">Usage</a>
|
||||
<ul>
|
||||
<li><a href="#command-line">Command Line</a></li>
|
||||
<li><a href="#api">API</a></li>
|
||||
<li><a href="#deploy-derived-projects">Deploy Derived Projects</a></li>
|
||||
<li><a href="#development-guide">Development Guide</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li><a href="#todo">TODO</a></li>
|
||||
<li><a href="#known-issues">Known Issues</a></li>
|
||||
<li><a href="#faq">FAQ</a></li>
|
||||
<li><a href="#all-thanks-to-our-contributors">All Thanks To Our Contributors</a></li>
|
||||
<li><a href="#license-information">License Information</a></li>
|
||||
<li><a href="#acknowledgments">Acknowledgments</a></li>
|
||||
<li><a href="#citation">Citation</a></li>
|
||||
<li><a href="#star-history">Star History</a></li>
|
||||
<li><a href="#magic-doc">Magic-doc</a></li>
|
||||
<li><a href="#magic-html">Magic-html</a></li>
|
||||
<li><a href="#links">Links</a></li>
|
||||
</ol>
|
||||
</details>
|
||||
<details open="open">
|
||||
<summary><h2 style="display: inline-block">Table of Contents</h2></summary>
|
||||
<ol>
|
||||
<li>
|
||||
<a href="#mineru">MinerU</a>
|
||||
<ul>
|
||||
<li><a href="#project-introduction">Project Introduction</a></li>
|
||||
<li><a href="#key-features">Key Features</a></li>
|
||||
<li><a href="#quick-start">Quick Start</a>
|
||||
<ul>
|
||||
<li><a href="#online-demo">Online Demo</a></li>
|
||||
<li><a href="#local-deployment">Local Deployment</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li><a href="#todo">TODO</a></li>
|
||||
<li><a href="#known-issues">Known Issues</a></li>
|
||||
<li><a href="#faq">FAQ</a></li>
|
||||
<li><a href="#all-thanks-to-our-contributors">All Thanks To Our Contributors</a></li>
|
||||
<li><a href="#license-information">License Information</a></li>
|
||||
<li><a href="#acknowledgments">Acknowledgments</a></li>
|
||||
<li><a href="#citation">Citation</a></li>
|
||||
<li><a href="#star-history">Star History</a></li>
|
||||
<li><a href="#magic-doc">Magic-doc</a></li>
|
||||
<li><a href="#magic-html">Magic-html</a></li>
|
||||
<li><a href="#links">Links</a></li>
|
||||
</ol>
|
||||
</details>
|
||||
|
||||
# MinerU
|
||||
@@ -717,6 +707,7 @@ Currently, some models in this project are trained based on YOLO. However, since
|
||||
- [xy-cut](https://github.com/Sanster/xy-cut)
|
||||
- [fast-langdetect](https://github.com/LlmKira/fast-langdetect)
|
||||
- [pypdfium2](https://github.com/pypdfium2-team/pypdfium2)
|
||||
- [pdftext](https://github.com/datalab-to/pdftext)
|
||||
- [pdfminer.six](https://github.com/pdfminer/pdfminer.six)
|
||||
- [pypdf](https://github.com/py-pdf/pypdf)
|
||||
|
||||
|
||||
@@ -336,49 +336,38 @@
|
||||
<details>
|
||||
<summary>2024/07/05 首次开源</summary>
|
||||
</details>
|
||||
|
||||
|
||||
<!-- TABLE OF CONTENT -->
|
||||
|
||||
<details open="open">
|
||||
<summary><h2 style="display: inline-block">文档目录</h2></summary>
|
||||
<ol>
|
||||
<li>
|
||||
<a href="#mineru">MinerU</a>
|
||||
<ul>
|
||||
<li><a href="#项目简介">项目简介</a></li>
|
||||
<li><a href="#主要功能">主要功能</a></li>
|
||||
<li><a href="#快速开始">快速开始</a>
|
||||
<ul>
|
||||
<li><a href="#在线体验">在线体验</a></li>
|
||||
<li><a href="#使用CPU快速体验">使用CPU快速体验</a></li>
|
||||
<li><a href="#使用GPU">使用GPU</a></li>
|
||||
<li><a href="#使用NPU">使用NPU</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
<li><a href="#使用">使用方式</a>
|
||||
<ul>
|
||||
<li><a href="#命令行">命令行</a></li>
|
||||
<li><a href="#api">API</a></li>
|
||||
<li><a href="#部署衍生项目">部署衍生项目</a></li>
|
||||
<li><a href="#二次开发">二次开发</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li><a href="#todo">TODO</a></li>
|
||||
<li><a href="#known-issues">Known Issues</a></li>
|
||||
<li><a href="#faq">FAQ</a></li>
|
||||
<li><a href="#all-thanks-to-our-contributors">Contributors</a></li>
|
||||
<li><a href="#license-information">License Information</a></li>
|
||||
<li><a href="#acknowledgments">Acknowledgements</a></li>
|
||||
<li><a href="#citation">Citation</a></li>
|
||||
<li><a href="#star-history">Star History</a></li>
|
||||
<li><a href="#magic-doc">magic-doc快速提取PPT/DOC/PDF</a></li>
|
||||
<li><a href="#magic-html">magic-html提取混合网页内容</a></li>
|
||||
<li><a href="#links">Links</a></li>
|
||||
</ol>
|
||||
</details>
|
||||
</details>
|
||||
|
||||
|
||||
<!-- TABLE OF CONTENT -->
|
||||
|
||||
<details open="open">
|
||||
<summary><h2 style="display: inline-block">文档目录</h2></summary>
|
||||
<ol>
|
||||
<li>
|
||||
<a href="#mineru">MinerU</a>
|
||||
<ul>
|
||||
<li><a href="#项目简介">项目简介</a></li>
|
||||
<li><a href="#主要功能">主要功能</a></li>
|
||||
<li><a href="#快速开始">快速开始</a>
|
||||
<ul>
|
||||
<li><a href="#在线体验">在线体验</a></li>
|
||||
<li><a href="#本地部署">本地部署</a></li>
|
||||
</ul>
|
||||
</ul>
|
||||
</li>
|
||||
<li><a href="#todo">TODO</a></li>
|
||||
<li><a href="#known-issues">Known Issues</a></li>
|
||||
<li><a href="#faq">FAQ</a></li>
|
||||
<li><a href="#all-thanks-to-our-contributors">Contributors</a></li>
|
||||
<li><a href="#license-information">License Information</a></li>
|
||||
<li><a href="#acknowledgments">Acknowledgements</a></li>
|
||||
<li><a href="#citation">Citation</a></li>
|
||||
<li><a href="#star-history">Star History</a></li>
|
||||
<li><a href="#magic-doc">magic-doc快速提取PPT/DOC/PDF</a></li>
|
||||
<li><a href="#magic-html">magic-html提取混合网页内容</a></li>
|
||||
<li><a href="#links">Links</a></li>
|
||||
</ol>
|
||||
</details>
|
||||
|
||||
|
||||
@@ -710,6 +699,7 @@ mineru -p <input_path> -o <output_path> -b vlm-sglang-client -u http://127.0.0.1
|
||||
- [xy-cut](https://github.com/Sanster/xy-cut)
|
||||
- [fast-langdetect](https://github.com/LlmKira/fast-langdetect)
|
||||
- [pypdfium2](https://github.com/pypdfium2-team/pypdfium2)
|
||||
- [pdftext](https://github.com/datalab-to/pdftext)
|
||||
- [pdfminer.six](https://github.com/pdfminer/pdfminer.six)
|
||||
- [pypdf](https://github.com/py-pdf/pypdf)
|
||||
|
||||
|
||||
31
SECURITY.md
Normal file
31
SECURITY.md
Normal file
@@ -0,0 +1,31 @@
|
||||
# Security Policy
|
||||
|
||||
## Supported Versions
|
||||
|
||||
latest
|
||||
|
||||
## Reporting a Vulnerability
|
||||
|
||||
Please do not report security vulnerabilities through public GitHub issues.
|
||||
|
||||
Instead, please report them at https://github.com/opendatalab/MinerU/security.
|
||||
|
||||
Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
|
||||
|
||||
* Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
|
||||
* Full paths of source file(s) related to the manifestation of the issue
|
||||
* The location of the affected source code (tag/branch/commit or direct URL)
|
||||
* Any special configuration required to reproduce the issue
|
||||
* Step-by-step instructions to reproduce the issue
|
||||
* Proof-of-concept or exploit code (if possible)
|
||||
* Impact of the issue, including how an attacker might exploit the issue
|
||||
|
||||
This information will help us triage your report more quickly.
|
||||
|
||||
## Preferred Languages
|
||||
|
||||
We prefer all communications to be in English or Chinese.
|
||||
|
||||
## Policy
|
||||
|
||||
We will fix security issues in the project's own code as quickly as possible. Until the fix has been completed, you must not disclose the vulnerability on any public platform.
|
||||
@@ -42,9 +42,8 @@ def download_and_modify_json(url, local_filename, modifications):
|
||||
|
||||
def configure_model(model_dir, model_type):
|
||||
"""配置模型"""
|
||||
# json_url = 'https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/mineru.template.json'
|
||||
json_url = 'https://gcore.jsdelivr.net/gh/myhloli/Magic-PDF@dev/mineru.template.json'
|
||||
config_file_name = 'mineru.json'
|
||||
json_url = 'https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/mineru.template.json'
|
||||
config_file_name = os.getenv('MINERU_TOOLS_CONFIG_JSON', 'mineru.json')
|
||||
home_dir = os.path.expanduser('~')
|
||||
config_file = os.path.join(home_dir, config_file_name)
|
||||
|
||||
@@ -120,13 +119,13 @@ def download_models(model_source, model_type):
|
||||
click.echo(f"Downloading model: {model_path}")
|
||||
download_finish_path = auto_download_and_get_model_root_path(model_path, repo_mode='pipeline')
|
||||
click.echo(f"Pipeline models downloaded successfully to: {download_finish_path}")
|
||||
configure_model(download_finish_path, model_type)
|
||||
configure_model(download_finish_path, "pipeline")
|
||||
|
||||
def download_vlm_models():
|
||||
"""下载VLM模型"""
|
||||
download_finish_path = auto_download_and_get_model_root_path("/", repo_mode='vlm')
|
||||
click.echo(f"VLM models downloaded successfully to: {download_finish_path}")
|
||||
configure_model(download_finish_path, model_type)
|
||||
configure_model(download_finish_path, "vlm")
|
||||
|
||||
try:
|
||||
if model_type == 'pipeline':
|
||||
|
||||
@@ -1 +1 @@
|
||||
__version__ = "2.0.0"
|
||||
__version__ = "2.0.0"
|
||||
|
||||
@@ -263,6 +263,46 @@
|
||||
"created_at": "2025-04-30T09:25:31Z",
|
||||
"repoId": 765083837,
|
||||
"pullRequestNo": 2411
|
||||
},
|
||||
{
|
||||
"name": "seedclaimer",
|
||||
"id": 86753366,
|
||||
"comment_id": 2916194375,
|
||||
"created_at": "2025-05-28T12:50:25Z",
|
||||
"repoId": 765083837,
|
||||
"pullRequestNo": 2536
|
||||
},
|
||||
{
|
||||
"name": "liuzhenghua",
|
||||
"id": 11787325,
|
||||
"comment_id": 2921092605,
|
||||
"created_at": "2025-05-30T02:57:07Z",
|
||||
"repoId": 765083837,
|
||||
"pullRequestNo": 2550
|
||||
},
|
||||
{
|
||||
"name": "PairZhu",
|
||||
"id": 47098840,
|
||||
"comment_id": 2938149702,
|
||||
"created_at": "2025-06-04T02:39:39Z",
|
||||
"repoId": 765083837,
|
||||
"pullRequestNo": 2566
|
||||
},
|
||||
{
|
||||
"name": "AdrianWangs",
|
||||
"id": 72337244,
|
||||
"comment_id": 2943818300,
|
||||
"created_at": "2025-06-05T11:30:42Z",
|
||||
"repoId": 765083837,
|
||||
"pullRequestNo": 2578
|
||||
},
|
||||
{
|
||||
"name": "YanzhenHuang",
|
||||
"id": 86364920,
|
||||
"comment_id": 2968974232,
|
||||
"created_at": "2025-06-13T04:17:08Z",
|
||||
"repoId": 765083837,
|
||||
"pullRequestNo": 2620
|
||||
}
|
||||
]
|
||||
}
|
||||
Reference in New Issue
Block a user