mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-30 20:48:38 +07:00
Compare commits
30 Commits
mineru-3.0
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ff2caf3a84 | ||
|
|
be675a5a97 | ||
|
|
fa5f8d61f9 | ||
|
|
a6f250b89c | ||
|
|
2bda24815a | ||
|
|
980670a649 | ||
|
|
84d1384c2b | ||
|
|
de8ea317ba | ||
|
|
d0d2bf920b | ||
|
|
6a5a2efc50 | ||
|
|
1cfa71be1c | ||
|
|
66df0cd367 | ||
|
|
9063a4133d | ||
|
|
b85d18dca1 | ||
|
|
016169ce2e | ||
|
|
c635892b75 | ||
|
|
d9778814fa | ||
|
|
62054c9528 | ||
|
|
de77fc3af5 | ||
|
|
f913df4ec6 | ||
|
|
264c594b23 | ||
|
|
a6b6d3081c | ||
|
|
1f82300d1d | ||
|
|
2d4fa2cc8e | ||
|
|
71b9e9f780 | ||
|
|
5e51ab2934 | ||
|
|
520c61faaf | ||
|
|
72601b314a | ||
|
|
e54c67dcd1 | ||
|
|
33e4fbd694 |
@@ -12,8 +12,7 @@ services:
|
||||
command:
|
||||
--host 0.0.0.0
|
||||
--port 30000
|
||||
# --data-parallel-size 2 # If using multiple GPUs, increase throughput using vllm's multi-GPU parallel mode
|
||||
# --gpu-memory-utilization 0.5 # If running on a single GPU and encountering VRAM shortage, reduce the KV cache size by this parameter, if VRAM issues persist, try lowering it further to `0.4` or below.
|
||||
# --gpu-memory-utilization 0.5 # If encountering VRAM shortage, reduce the KV cache size by this parameter; if VRAM issues persist, try lowering it further to `0.4` or below.
|
||||
ulimits:
|
||||
memlock: -1
|
||||
stack: 67108864
|
||||
@@ -42,8 +41,7 @@ services:
|
||||
--host 0.0.0.0
|
||||
--port 8000
|
||||
# parameters for vllm-engine
|
||||
# --data-parallel-size 2 # If using multiple GPUs, increase throughput using vllm's multi-GPU parallel mode
|
||||
# --gpu-memory-utilization 0.5 # If running on a single GPU and encountering VRAM shortage, reduce the KV cache size by this parameter, if VRAM issues persist, try lowering it further to `0.4` or below.
|
||||
# --gpu-memory-utilization 0.5 # If encountering VRAM shortage, reduce the KV cache size by this parameter; if VRAM issues persist, try lowering it further to `0.4` or below.
|
||||
ulimits:
|
||||
memlock: -1
|
||||
stack: 67108864
|
||||
@@ -58,6 +56,40 @@ services:
|
||||
device_ids: ["0"] # Modify for multiple GPUs: ["0", "1"]
|
||||
capabilities: [gpu]
|
||||
|
||||
mineru-router:
|
||||
image: mineru:latest
|
||||
container_name: mineru-router
|
||||
restart: always
|
||||
profiles: ["router"]
|
||||
ports:
|
||||
- 8002:8002
|
||||
environment:
|
||||
MINERU_MODEL_SOURCE: local
|
||||
entrypoint: mineru-router
|
||||
command:
|
||||
--host 0.0.0.0
|
||||
--port 8002
|
||||
--local-gpus auto
|
||||
# To aggregate existing mineru-api services instead of starting local workers:
|
||||
# --local-gpus none
|
||||
# --upstream-url http://mineru-api:8000
|
||||
# --upstream-url http://mineru-api-2:8000
|
||||
# parameters for vllm-engine
|
||||
# --gpu-memory-utilization 0.5 # If encountering VRAM shortage, reduce the KV cache size by this parameter; if VRAM issues persist, try lowering it further to `0.4` or below.
|
||||
ulimits:
|
||||
memlock: -1
|
||||
stack: 67108864
|
||||
ipc: host
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "curl -f http://localhost:8002/health || exit 1"]
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
device_ids: ["0"] # Modify for multiple GPUs: ["0", "1"]
|
||||
capabilities: [gpu]
|
||||
|
||||
mineru-gradio:
|
||||
image: mineru:latest
|
||||
container_name: mineru-gradio
|
||||
@@ -74,8 +106,7 @@ services:
|
||||
# --enable-api false # If you want to disable the API, set this to false
|
||||
# --max-convert-pages 20 # If you want to limit the number of pages for conversion, set this to a specific number
|
||||
# parameters for vllm-engine
|
||||
# --data-parallel-size 2 # If using multiple GPUs, increase throughput using vllm's multi-GPU parallel mode
|
||||
# --gpu-memory-utilization 0.5 # If running on a single GPU and encountering VRAM shortage, reduce the KV cache size by this parameter, if VRAM issues persist, try lowering it further to `0.4` or below.
|
||||
# --gpu-memory-utilization 0.5 # If encountering VRAM shortage, reduce the KV cache size by this parameter; if VRAM issues persist, try lowering it further to `0.4` or below.
|
||||
ulimits:
|
||||
memlock: -1
|
||||
stack: 67108864
|
||||
|
||||
@@ -25,7 +25,7 @@ MinerU's Docker uses `vllm/vllm-openai` as the base image, so it includes the `v
|
||||
```bash
|
||||
docker run --gpus all \
|
||||
--shm-size 32g \
|
||||
-p 30000:30000 -p 7860:7860 -p 8000:8000 \
|
||||
-p 30000:30000 -p 7860:7860 -p 8000:8000 -p 8002:8002 \
|
||||
--ipc=host \
|
||||
-it mineru:latest \
|
||||
/bin/bash
|
||||
@@ -73,6 +73,17 @@ connect to `openai-server` via `vlm-http-client` backend
|
||||
|
||||
---
|
||||
|
||||
### Start MinerU Router service
|
||||
```bash
|
||||
docker compose -f compose.yaml --profile router up -d
|
||||
```
|
||||
>[!TIP]
|
||||
>
|
||||
>- The default configuration runs in `--local-gpus auto` mode, automatically starting local workers in the container and exposing the unified entry at `http://<server_ip>:8002/docs`.
|
||||
>- If you want to aggregate existing `mineru-api` services instead of starting local workers, refer to the commented example under the `mineru-router` service in `compose.yaml` and switch to `--upstream-url`.
|
||||
|
||||
---
|
||||
|
||||
### Start Gradio WebUI service
|
||||
```bash
|
||||
docker compose -f compose.yaml --profile gradio up -d
|
||||
|
||||
@@ -50,7 +50,7 @@ A WebUI developed based on Gradio, with a simple interface and only core parsing
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Accuracy<sup>1</sup></th>
|
||||
<td style="text-align:center;">82+</td>
|
||||
<td style="text-align:center;">86+</td>
|
||||
<td colspan="4" style="text-align:center;">90+</td>
|
||||
</tr>
|
||||
<tr>
|
||||
@@ -70,15 +70,15 @@ A WebUI developed based on Gradio, with a simple interface and only core parsing
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Min VRAM</th>
|
||||
<td style="text-align:center;">6GB</td>
|
||||
<td style="text-align:center;">10GB</td>
|
||||
<td style="text-align:center;">4GB</td>
|
||||
<td style="text-align:center;">8GB</td>
|
||||
<td style="text-align:center;">3GB</td>
|
||||
<td style="text-align:center;">8GB</td>
|
||||
<td style="text-align:center;">2GB</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>RAM</th>
|
||||
<td colspan="3" style="text-align:center;">Min 16GB+, Recommended 32GB+</td>
|
||||
<td colspan="2" style="text-align:center;">8GB</td>
|
||||
<td colspan="2" style="text-align:center;">16GB</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Disk Space</th>
|
||||
|
||||
@@ -104,6 +104,8 @@ If you need to adjust parsing options through custom parameters, you can also ch
|
||||
> ```bash
|
||||
> mineru -p <input_path> -o <output_path> -b hybrid-http-client -u http://127.0.0.1:30000
|
||||
> ```
|
||||
>`vlm-http-client` is the lightweight remote client option and does not require local `torch`.
|
||||
>`hybrid-http-client` requires local pipeline dependencies such as `mineru[pipeline]` and `torch`.
|
||||
|
||||
> [!NOTE]
|
||||
> All officially supported `vllm/lmdeploy` parameters can be passed to MinerU through command line arguments, including the following commands: `mineru`, `mineru-openai-server`, `mineru-gradio`, `mineru-api`, `mineru-router`.
|
||||
|
||||
@@ -25,7 +25,7 @@ Mineru的docker使用了`vllm/vllm-openai`作为基础镜像,因此在docker
|
||||
```bash
|
||||
docker run --gpus all \
|
||||
--shm-size 32g \
|
||||
-p 30000:30000 -p 7860:7860 -p 8000:8000 \
|
||||
-p 30000:30000 -p 7860:7860 -p 8000:8000 -p 8002:8002 \
|
||||
--ipc=host \
|
||||
-it mineru:latest \
|
||||
/bin/bash
|
||||
@@ -72,10 +72,21 @@ wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/compose.yaml
|
||||
|
||||
---
|
||||
|
||||
### 启动 MinerU Router 服务
|
||||
```bash
|
||||
docker compose -f compose.yaml --profile router up -d
|
||||
```
|
||||
>[!TIP]
|
||||
>
|
||||
>- 默认配置会以 `--local-gpus auto` 模式在容器内自动拉起本地 worker,并通过 `http://<server_ip>:8002/docs` 暴露统一入口。
|
||||
>- 如果您希望聚合已有的 `mineru-api` 服务而不是启动本地 worker,可直接参考 `compose.yaml` 中 `mineru-router` 服务下的注释示例,改为使用 `--upstream-url`。
|
||||
|
||||
---
|
||||
|
||||
### 启动 Gradio WebUI 服务
|
||||
```bash
|
||||
docker compose -f compose.yaml --profile gradio up -d
|
||||
```
|
||||
>[!TIP]
|
||||
>
|
||||
>- 在浏览器中访问 `http://<server_ip>:7860` 使用 Gradio WebUI。
|
||||
>- 在浏览器中访问 `http://<server_ip>:7860` 使用 Gradio WebUI。
|
||||
|
||||
@@ -50,7 +50,7 @@
|
||||
</tr>
|
||||
<tr>
|
||||
<th>精度指标<sup>1</sup></th>
|
||||
<td style="text-align:center;">82+</td>
|
||||
<td style="text-align:center;">86+</td>
|
||||
<td colspan="4" style="text-align:center;">90+</td>
|
||||
</tr>
|
||||
<tr>
|
||||
@@ -70,15 +70,15 @@
|
||||
</tr>
|
||||
<tr>
|
||||
<th>显存最低要求</th>
|
||||
<td style="text-align:center;">6GB</td>
|
||||
<td style="text-align:center;">10GB</td>
|
||||
<td style="text-align:center;">4GB</td>
|
||||
<td style="text-align:center;">8GB</td>
|
||||
<td style="text-align:center;">3GB</td>
|
||||
<td style="text-align:center;">8GB</td>
|
||||
<td style="text-align:center;">2GB</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>内存要求</th>
|
||||
<td colspan="3" style="text-align:center;">最低16GB以上,推荐32GB以上</td>
|
||||
<td colspan="2" style="text-align:center;">8GB</td>
|
||||
<td colspan="2" style="text-align:center;">16GB</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>磁盘空间要求</th>
|
||||
|
||||
@@ -104,6 +104,8 @@ mineru -p <input_path> -o <output_path>
|
||||
> ```bash
|
||||
> mineru -p <input_path> -o <output_path> -b hybrid-http-client -u http://127.0.0.1:30000
|
||||
> ```
|
||||
>`vlm-http-client` 是轻量远程 client,用法上不要求本地安装 `torch`。
|
||||
>`hybrid-http-client` 需要本地具备 `mineru[pipeline]` 及 `torch` 等 pipeline 依赖。
|
||||
|
||||
> [!NOTE]
|
||||
> 所有`vllm/lmdeploy`官方支持的参数都可用通过命令行参数传递给 MinerU,包括以下命令:`mineru`、`mineru-openai-server`、`mineru-gradio`、`mineru-api`、`mineru-router`,
|
||||
|
||||
@@ -131,23 +131,27 @@ def blocks_to_page_info(
|
||||
return page_info
|
||||
|
||||
|
||||
def _iter_block_spans(block):
|
||||
for line in block.get("lines", []):
|
||||
for span in line.get("spans", []):
|
||||
yield span
|
||||
|
||||
for sub_block in block.get("blocks", []):
|
||||
yield from _iter_block_spans(sub_block)
|
||||
|
||||
|
||||
def _apply_post_ocr(pdf_info_list, hybrid_pipeline_model):
|
||||
need_ocr_list = []
|
||||
img_crop_list = []
|
||||
text_block_list = []
|
||||
for page_info in pdf_info_list:
|
||||
for block in page_info['para_blocks']:
|
||||
if block['type'] in ['table', 'image', 'list', 'code']:
|
||||
for sub_block in block['blocks']:
|
||||
if not sub_block['type'].endswith('body'):
|
||||
text_block_list.append(sub_block)
|
||||
elif block['type'] in ['text', 'title', 'ref_text']:
|
||||
text_block_list.append(block)
|
||||
for block in page_info['discarded_blocks']:
|
||||
text_block_list.append(block)
|
||||
for block in text_block_list:
|
||||
for line in block['lines']:
|
||||
for span in line['spans']:
|
||||
for block in page_info.get('para_blocks', []):
|
||||
for span in _iter_block_spans(block):
|
||||
if 'np_img' in span:
|
||||
need_ocr_list.append(span)
|
||||
img_crop_list.append(rotate_vertical_crop_if_needed(span['np_img']))
|
||||
span.pop('np_img')
|
||||
for block in page_info.get('discarded_blocks', []):
|
||||
for span in _iter_block_spans(block):
|
||||
if 'np_img' in span:
|
||||
need_ocr_list.append(span)
|
||||
img_crop_list.append(rotate_vertical_crop_if_needed(span['np_img']))
|
||||
|
||||
@@ -203,6 +203,15 @@ def _extract_text_from_block(block):
|
||||
return "".join(text_parts).strip()
|
||||
|
||||
|
||||
def _iter_block_spans(block):
|
||||
for line in block.get("lines", []):
|
||||
for span in line.get("spans", []):
|
||||
yield span
|
||||
|
||||
for sub_block in block.get("blocks", []):
|
||||
yield from _iter_block_spans(sub_block)
|
||||
|
||||
|
||||
def _normalize_formula_tag_content(tag_content):
|
||||
tag_content = full_to_half(tag_content.strip())
|
||||
if tag_content.startswith("("):
|
||||
@@ -261,22 +270,18 @@ def _optimize_formula_number_blocks(pdf_info_list):
|
||||
def _apply_post_ocr(pdf_info_list, lang=None):
|
||||
need_ocr_list = []
|
||||
img_crop_list = []
|
||||
text_block_list = []
|
||||
|
||||
for page_info in pdf_info_list:
|
||||
for block in page_info['preproc_blocks']:
|
||||
if 'blocks' in block:
|
||||
for sub_block in block['blocks']:
|
||||
if sub_block.get("type", "").endswith('caption') or sub_block.get("type", "").endswith('footnote'):
|
||||
text_block_list.append(sub_block)
|
||||
elif block["type"] not in [BlockType.INTERLINE_EQUATION, BlockType.SEAL]:
|
||||
text_block_list.append(block)
|
||||
for block in page_info['discarded_blocks']:
|
||||
text_block_list.append(block)
|
||||
for block in page_info.get('preproc_blocks', []):
|
||||
for span in _iter_block_spans(block):
|
||||
if 'np_img' in span:
|
||||
need_ocr_list.append(span)
|
||||
# Keep post-OCR rec aligned with the main OCR pipeline for vertical tall crops.
|
||||
img_crop_list.append(rotate_vertical_crop_if_needed(span['np_img']))
|
||||
span.pop('np_img')
|
||||
|
||||
for block in text_block_list:
|
||||
for line in block['lines']:
|
||||
for span in line['spans']:
|
||||
for block in page_info.get('discarded_blocks', []):
|
||||
for span in _iter_block_spans(block):
|
||||
if 'np_img' in span:
|
||||
need_ocr_list.append(span)
|
||||
# Keep post-OCR rec aligned with the main OCR pipeline for vertical tall crops.
|
||||
|
||||
@@ -32,6 +32,7 @@ TASKS_ENDPOINT = "/tasks"
|
||||
TASK_STATUS_POLL_INTERVAL_SECONDS = 1.0
|
||||
TASK_RESULT_TIMEOUT_SECONDS = 3600
|
||||
LOCAL_API_STARTUP_TIMEOUT_SECONDS = 30
|
||||
LOCAL_API_SHUTDOWN_TIMEOUT_SECONDS = 10
|
||||
LOCAL_API_CLEANUP_RETRIES = 8
|
||||
LOCAL_API_CLEANUP_RETRY_INTERVAL_SECONDS = 0.25
|
||||
|
||||
@@ -87,6 +88,7 @@ class LocalAPIServer:
|
||||
read_max_concurrent_requests(default=DEFAULT_MAX_CONCURRENT_REQUESTS)
|
||||
)
|
||||
env["MINERU_API_DISABLE_ACCESS_LOG"] = "1"
|
||||
env["MINERU_API_SHUTDOWN_ON_STDIN_EOF"] = "1"
|
||||
self.output_root.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
command = [
|
||||
@@ -103,6 +105,7 @@ class LocalAPIServer:
|
||||
command,
|
||||
cwd=os.getcwd(),
|
||||
env=env,
|
||||
stdin=subprocess.PIPE,
|
||||
)
|
||||
|
||||
if not self._atexit_registered:
|
||||
@@ -115,12 +118,23 @@ class LocalAPIServer:
|
||||
self.process = None
|
||||
try:
|
||||
if process is not None and process.poll() is None:
|
||||
process.terminate()
|
||||
if process.stdin is not None and not process.stdin.closed:
|
||||
process.stdin.close()
|
||||
try:
|
||||
process.wait(timeout=5)
|
||||
process.wait(timeout=LOCAL_API_SHUTDOWN_TIMEOUT_SECONDS)
|
||||
except subprocess.TimeoutExpired:
|
||||
logger.debug(
|
||||
"Local mineru-api did not stop after stdin EOF within {}s. Falling back to SIGTERM.",
|
||||
LOCAL_API_SHUTDOWN_TIMEOUT_SECONDS,
|
||||
)
|
||||
process.terminate()
|
||||
try:
|
||||
process.wait(timeout=LOCAL_API_SHUTDOWN_TIMEOUT_SECONDS)
|
||||
return
|
||||
except subprocess.TimeoutExpired:
|
||||
pass
|
||||
process.kill()
|
||||
process.wait(timeout=5)
|
||||
process.wait(timeout=LOCAL_API_SHUTDOWN_TIMEOUT_SECONDS)
|
||||
finally:
|
||||
if self._atexit_registered:
|
||||
try:
|
||||
@@ -475,7 +489,17 @@ async def wait_for_task_result(
|
||||
) -> None:
|
||||
deadline = asyncio.get_running_loop().time() + timeout_seconds
|
||||
while asyncio.get_running_loop().time() < deadline:
|
||||
response = await client.get(submit_response.status_url)
|
||||
try:
|
||||
response = await client.get(submit_response.status_url)
|
||||
except httpx.ReadTimeout:
|
||||
logger.warning(
|
||||
"Timed out while polling task status for {} (task_id={}). "
|
||||
"This can happen during cold start; retrying until the task deadline.",
|
||||
task_label,
|
||||
submit_response.task_id,
|
||||
)
|
||||
await asyncio.sleep(TASK_STATUS_POLL_INTERVAL_SECONDS)
|
||||
continue
|
||||
if response.status_code != 200:
|
||||
raise click.ClickException(
|
||||
f"Failed to query task status for {task_label}: "
|
||||
|
||||
@@ -30,6 +30,8 @@ from mineru.utils.pdfium_guard import (
|
||||
|
||||
from mineru.version import __version__
|
||||
from mineru.cli.common import (
|
||||
HybridDependencyError,
|
||||
ensure_backend_dependencies,
|
||||
image_suffixes,
|
||||
office_suffixes,
|
||||
pdf_suffixes,
|
||||
@@ -847,6 +849,11 @@ async def run_orchestrated_cli(
|
||||
raise click.ClickException("--start must be greater than or equal to 0")
|
||||
if end_page_id is not None and end_page_id < 0:
|
||||
raise click.ClickException("--end must be greater than or equal to 0")
|
||||
if api_url is None:
|
||||
try:
|
||||
ensure_backend_dependencies(backend)
|
||||
except HybridDependencyError as exc:
|
||||
raise click.ClickException(str(exc)) from exc
|
||||
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
documents = collect_input_documents(
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
# Copyright (c) Opendatalab. All rights reserved.
|
||||
import importlib
|
||||
import importlib.util
|
||||
import json
|
||||
import os
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
@@ -40,6 +42,37 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
MAX_TASK_STEM_BYTES = 200
|
||||
|
||||
|
||||
class HybridDependencyError(RuntimeError):
|
||||
pass
|
||||
|
||||
|
||||
def build_hybrid_dependency_error_message(backend: str) -> str:
|
||||
return (
|
||||
f"`{backend}` requires local pipeline dependencies (`mineru[pipeline]`, "
|
||||
"including `torch`). Install `mineru[pipeline]` or `mineru[core]`. "
|
||||
"If you need a lightweight remote client without local `torch`, "
|
||||
"use `vlm-http-client` instead."
|
||||
)
|
||||
|
||||
|
||||
def ensure_backend_dependencies(backend: str) -> None:
|
||||
if not backend.startswith("hybrid-"):
|
||||
return
|
||||
if importlib.util.find_spec("torch") is None:
|
||||
raise HybridDependencyError(build_hybrid_dependency_error_message(backend))
|
||||
|
||||
|
||||
def _load_hybrid_analyze_entrypoint(entrypoint_name: str, backend: str):
|
||||
ensure_backend_dependencies(backend)
|
||||
try:
|
||||
hybrid_analyze = importlib.import_module("mineru.backend.hybrid.hybrid_analyze")
|
||||
except (ImportError, ModuleNotFoundError) as exc:
|
||||
raise HybridDependencyError(
|
||||
build_hybrid_dependency_error_message(backend)
|
||||
) from exc
|
||||
return getattr(hybrid_analyze, entrypoint_name)
|
||||
|
||||
|
||||
def utf8_byte_length(value: str) -> int:
|
||||
return len(value.encode("utf-8"))
|
||||
|
||||
@@ -438,7 +471,10 @@ def _process_hybrid(
|
||||
server_url=None,
|
||||
**kwargs,
|
||||
):
|
||||
from mineru.backend.hybrid.hybrid_analyze import doc_analyze as hybrid_doc_analyze
|
||||
hybrid_doc_analyze = _load_hybrid_analyze_entrypoint(
|
||||
"doc_analyze",
|
||||
f"hybrid-{backend}",
|
||||
)
|
||||
"""同步处理hybrid后端逻辑"""
|
||||
if not backend.endswith("client"):
|
||||
server_url = None
|
||||
@@ -491,7 +527,10 @@ async def _async_process_hybrid(
|
||||
server_url=None,
|
||||
**kwargs,
|
||||
):
|
||||
from mineru.backend.hybrid.hybrid_analyze import aio_doc_analyze as aio_hybrid_doc_analyze
|
||||
aio_hybrid_doc_analyze = _load_hybrid_analyze_entrypoint(
|
||||
"aio_doc_analyze",
|
||||
f"hybrid-{backend}",
|
||||
)
|
||||
"""异步处理hybrid后端逻辑"""
|
||||
if not backend.endswith("client"):
|
||||
server_url = None
|
||||
@@ -642,6 +681,7 @@ def do_parse(
|
||||
server_url, **kwargs,
|
||||
)
|
||||
elif backend.startswith("hybrid-"):
|
||||
ensure_backend_dependencies(backend)
|
||||
backend = backend[7:]
|
||||
|
||||
if backend == "vllm-async-engine":
|
||||
@@ -734,6 +774,7 @@ async def aio_do_parse(
|
||||
server_url, **kwargs,
|
||||
)
|
||||
elif backend.startswith("hybrid-"):
|
||||
ensure_backend_dependencies(backend)
|
||||
backend = backend[7:]
|
||||
|
||||
if backend == "vllm-engine":
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import asyncio
|
||||
import mimetypes
|
||||
import multiprocessing
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
@@ -49,8 +50,8 @@ from mineru.cli.api_protocol import (
|
||||
DEFAULT_PROCESSING_WINDOW_SIZE,
|
||||
)
|
||||
from mineru.utils.cli_parser import arg_parse
|
||||
from mineru.utils.check_sys_env import is_mac_environment
|
||||
from mineru.utils.config_reader import (
|
||||
get_device,
|
||||
get_max_concurrent_requests as read_max_concurrent_requests,
|
||||
get_processing_window_size,
|
||||
)
|
||||
@@ -79,8 +80,7 @@ FILE_PARSE_TASK_RESULT_URL_HEADER = "X-MinerU-Task-Result-Url"
|
||||
|
||||
# 并发控制器
|
||||
_request_semaphore: Optional[asyncio.Semaphore] = None
|
||||
_configured_max_concurrent_requests = 0
|
||||
_mps_parse_lock = threading.Lock()
|
||||
_configured_max_concurrent_requests = 1
|
||||
|
||||
|
||||
def env_flag_enabled(name: str, default: bool = False) -> bool:
|
||||
@@ -90,6 +90,33 @@ def env_flag_enabled(name: str, default: bool = False) -> bool:
|
||||
return value.lower() in ("1", "true", "yes", "on")
|
||||
|
||||
|
||||
def is_main_multiprocessing_process() -> bool:
|
||||
try:
|
||||
return multiprocessing.current_process().name == "MainProcess"
|
||||
except Exception:
|
||||
return True
|
||||
|
||||
|
||||
def install_stdin_shutdown_watcher(server: uvicorn.Server) -> None:
|
||||
if not env_flag_enabled("MINERU_API_SHUTDOWN_ON_STDIN_EOF"):
|
||||
return
|
||||
|
||||
def _watch_stdin_for_eof() -> None:
|
||||
stdin_stream = getattr(sys.stdin, "buffer", sys.stdin)
|
||||
try:
|
||||
stdin_stream.read()
|
||||
except Exception:
|
||||
return
|
||||
server.should_exit = True
|
||||
|
||||
watcher = threading.Thread(
|
||||
target=_watch_stdin_for_eof,
|
||||
name="mineru-api-stdin-shutdown",
|
||||
daemon=True,
|
||||
)
|
||||
watcher.start()
|
||||
|
||||
|
||||
@dataclass
|
||||
class ParseRequestOptions:
|
||||
files: list[UploadFile]
|
||||
@@ -202,14 +229,19 @@ def create_app():
|
||||
)
|
||||
|
||||
global _request_semaphore, _configured_max_concurrent_requests
|
||||
max_concurrent_requests = read_max_concurrent_requests(
|
||||
default=DEFAULT_MAX_CONCURRENT_REQUESTS
|
||||
)
|
||||
|
||||
if is_mac_environment():
|
||||
max_concurrent_requests = 1
|
||||
else:
|
||||
max_concurrent_requests = read_max_concurrent_requests(
|
||||
default=DEFAULT_MAX_CONCURRENT_REQUESTS
|
||||
)
|
||||
|
||||
_configured_max_concurrent_requests = max_concurrent_requests
|
||||
app.state.max_concurrent_requests = max_concurrent_requests
|
||||
_request_semaphore = asyncio.Semaphore(max_concurrent_requests)
|
||||
logger.info(f"Request concurrency limited to {max_concurrent_requests}")
|
||||
if is_main_multiprocessing_process():
|
||||
logger.info(f"Request concurrency limited to {max_concurrent_requests}")
|
||||
|
||||
app.add_middleware(GZipMiddleware, minimum_size=1000)
|
||||
return app
|
||||
@@ -854,32 +886,12 @@ async def run_parse_job(
|
||||
)
|
||||
|
||||
if request_options.backend == "pipeline":
|
||||
async with serialize_parse_job_if_needed(request_options.backend):
|
||||
await asyncio.to_thread(do_parse, **parse_kwargs)
|
||||
await asyncio.to_thread(do_parse, **parse_kwargs)
|
||||
else:
|
||||
async with serialize_parse_job_if_needed(request_options.backend):
|
||||
await aio_do_parse(**parse_kwargs)
|
||||
await aio_do_parse(**parse_kwargs)
|
||||
return response_file_names
|
||||
|
||||
|
||||
def should_serialize_parse_job(backend: str) -> bool:
|
||||
if get_device() != "mps":
|
||||
return False
|
||||
return backend == "pipeline" or backend.startswith(("vlm-", "hybrid-"))
|
||||
|
||||
|
||||
@asynccontextmanager
|
||||
async def serialize_parse_job_if_needed(backend: str):
|
||||
if not should_serialize_parse_job(backend):
|
||||
yield
|
||||
return
|
||||
await asyncio.to_thread(_mps_parse_lock.acquire)
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
_mps_parse_lock.release()
|
||||
|
||||
|
||||
def create_task_output_dir(task_id: str) -> str:
|
||||
output_root = get_output_root()
|
||||
task_output_dir = output_root / task_id
|
||||
@@ -1422,7 +1434,16 @@ def main(ctx, host, port, reload, **kwargs):
|
||||
access_log=access_log,
|
||||
)
|
||||
else:
|
||||
uvicorn.run(app, host=host, port=port, reload=False, access_log=access_log)
|
||||
config = uvicorn.Config(
|
||||
app,
|
||||
host=host,
|
||||
port=port,
|
||||
reload=False,
|
||||
access_log=access_log,
|
||||
)
|
||||
server = uvicorn.Server(config)
|
||||
install_stdin_shutdown_watcher(server)
|
||||
server.run()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -8,9 +8,49 @@ DEFAULT_LANG = "txt"
|
||||
PDF_SIG_BYTES = b'%PDF'
|
||||
magika = Magika()
|
||||
|
||||
def _normalize_text_for_language_guess(code: str) -> str:
|
||||
if not code:
|
||||
return ""
|
||||
|
||||
normalized = []
|
||||
index = 0
|
||||
while index < len(code):
|
||||
current_char = code[index]
|
||||
current_ord = ord(current_char)
|
||||
|
||||
if 0xD800 <= current_ord <= 0xDBFF:
|
||||
if index + 1 < len(code):
|
||||
next_char = code[index + 1]
|
||||
next_ord = ord(next_char)
|
||||
if 0xDC00 <= next_ord <= 0xDFFF:
|
||||
pair = current_char + next_char
|
||||
normalized.append(pair.encode("utf-16", "surrogatepass").decode("utf-16"))
|
||||
index += 2
|
||||
continue
|
||||
index += 1
|
||||
continue
|
||||
|
||||
if 0xDC00 <= current_ord <= 0xDFFF:
|
||||
index += 1
|
||||
continue
|
||||
|
||||
normalized.append(current_char)
|
||||
index += 1
|
||||
|
||||
return "".join(normalized)
|
||||
|
||||
|
||||
def guess_language_by_text(code):
|
||||
codebytes = code.encode(encoding="utf-8")
|
||||
lang = magika.identify_bytes(codebytes).prediction.output.label
|
||||
normalized_code = _normalize_text_for_language_guess(code)
|
||||
if not normalized_code:
|
||||
return DEFAULT_LANG
|
||||
|
||||
try:
|
||||
codebytes = normalized_code.encode("utf-8", errors="replace")
|
||||
lang = magika.identify_bytes(codebytes).prediction.output.label
|
||||
except Exception:
|
||||
return DEFAULT_LANG
|
||||
|
||||
return lang if lang != "unknown" else DEFAULT_LANG
|
||||
|
||||
|
||||
@@ -32,4 +72,4 @@ def guess_suffix_by_path(file_path) -> str:
|
||||
suffix = "pdf"
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to read file {file_path} for PDF signature check: {e}")
|
||||
return suffix
|
||||
return suffix
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
# Copyright (c) Opendatalab. All rights reserved.
|
||||
import atexit
|
||||
import multiprocessing
|
||||
import os
|
||||
import signal
|
||||
import threading
|
||||
import time
|
||||
from io import BytesIO
|
||||
|
||||
@@ -25,6 +28,7 @@ from mineru.utils.pdfium_guard import (
|
||||
)
|
||||
|
||||
from concurrent.futures import ProcessPoolExecutor, wait, ALL_COMPLETED
|
||||
from concurrent.futures.process import BrokenProcessPool
|
||||
|
||||
|
||||
DEFAULT_PDF_IMAGE_DPI = 200
|
||||
@@ -32,6 +36,9 @@ DEFAULT_PDF_IMAGE_DPI = 200
|
||||
MAX_PDF_RENDER_PROCESSES = 4
|
||||
MIN_PAGES_PER_RENDER_PROCESS = 30
|
||||
|
||||
_pdf_render_executor: ProcessPoolExecutor | None = None
|
||||
_pdf_render_executor_lock = threading.Lock()
|
||||
|
||||
|
||||
def pdf_page_to_image(
|
||||
page: pdfium.PdfPage,
|
||||
@@ -113,6 +120,79 @@ def _get_render_process_plan(
|
||||
)
|
||||
|
||||
|
||||
def _get_pdf_render_pool_capacity(cpu_count=None) -> int:
|
||||
available_cpus = max(1, cpu_count if cpu_count is not None else (os.cpu_count() or 1))
|
||||
configured_threads = max(1, get_load_images_threads())
|
||||
return min(
|
||||
available_cpus,
|
||||
configured_threads,
|
||||
MAX_PDF_RENDER_PROCESSES,
|
||||
)
|
||||
|
||||
|
||||
def _create_pdf_render_executor(max_workers: int) -> ProcessPoolExecutor:
|
||||
if is_windows_environment():
|
||||
return ProcessPoolExecutor(max_workers=max_workers)
|
||||
|
||||
start_method = multiprocessing.get_start_method()
|
||||
if start_method == "fork":
|
||||
logger.debug(
|
||||
"PDF image rendering switches multiprocessing start method from fork to spawn"
|
||||
)
|
||||
return ProcessPoolExecutor(
|
||||
max_workers=max_workers,
|
||||
mp_context=multiprocessing.get_context("spawn"),
|
||||
)
|
||||
|
||||
return ProcessPoolExecutor(max_workers=max_workers)
|
||||
|
||||
|
||||
def _get_pdf_render_executor() -> ProcessPoolExecutor:
|
||||
global _pdf_render_executor
|
||||
|
||||
with _pdf_render_executor_lock:
|
||||
if _pdf_render_executor is None:
|
||||
max_workers = _get_pdf_render_pool_capacity()
|
||||
_pdf_render_executor = _create_pdf_render_executor(max_workers=max_workers)
|
||||
logger.debug(
|
||||
f"Created persistent PDF render executor with max_workers={max_workers}"
|
||||
)
|
||||
return _pdf_render_executor
|
||||
|
||||
|
||||
def _recycle_pdf_render_executor(
|
||||
executor: ProcessPoolExecutor | None,
|
||||
*,
|
||||
terminate_processes: bool,
|
||||
) -> None:
|
||||
global _pdf_render_executor
|
||||
|
||||
if executor is None:
|
||||
return
|
||||
|
||||
with _pdf_render_executor_lock:
|
||||
if _pdf_render_executor is executor:
|
||||
_pdf_render_executor = None
|
||||
|
||||
if terminate_processes:
|
||||
_terminate_executor_processes(executor)
|
||||
executor.shutdown(wait=False, cancel_futures=True)
|
||||
|
||||
|
||||
def shutdown_pdf_render_executor() -> None:
|
||||
global _pdf_render_executor
|
||||
|
||||
with _pdf_render_executor_lock:
|
||||
executor = _pdf_render_executor
|
||||
_pdf_render_executor = None
|
||||
|
||||
if executor is not None:
|
||||
executor.shutdown(wait=False, cancel_futures=True)
|
||||
|
||||
|
||||
atexit.register(shutdown_pdf_render_executor)
|
||||
|
||||
|
||||
def _load_images_from_pdf_bytes_range(
|
||||
pdf_bytes: bytes,
|
||||
dpi=DEFAULT_PDF_IMAGE_DPI,
|
||||
@@ -141,7 +221,8 @@ def _load_images_from_pdf_bytes_range(
|
||||
f"{start_page_id + 1}-{end_page_id + 1}: {page_ranges}"
|
||||
)
|
||||
|
||||
executor = ProcessPoolExecutor(max_workers=actual_threads)
|
||||
executor = _get_pdf_render_executor()
|
||||
recycle_executor = False
|
||||
try:
|
||||
futures = []
|
||||
future_to_range = {}
|
||||
@@ -159,7 +240,7 @@ def _load_images_from_pdf_bytes_range(
|
||||
|
||||
_, not_done = wait(futures, timeout=timeout, return_when=ALL_COMPLETED)
|
||||
if not_done:
|
||||
_terminate_executor_processes(executor)
|
||||
recycle_executor = True
|
||||
raise TimeoutError(
|
||||
f"PDF image rendering timeout after {timeout}s "
|
||||
f"for pages {start_page_id + 1}-{end_page_id + 1}"
|
||||
@@ -177,12 +258,16 @@ def _load_images_from_pdf_bytes_range(
|
||||
images_list.extend(imgs)
|
||||
|
||||
return images_list
|
||||
except Exception as exc:
|
||||
if not isinstance(exc, TimeoutError):
|
||||
_terminate_executor_processes(executor)
|
||||
except BrokenProcessPool:
|
||||
recycle_executor = True
|
||||
raise
|
||||
finally:
|
||||
executor.shutdown(wait=False, cancel_futures=True)
|
||||
if recycle_executor:
|
||||
logger.warning("Recycling persistent PDF render executor after render failure")
|
||||
_recycle_pdf_render_executor(
|
||||
executor,
|
||||
terminate_processes=True,
|
||||
)
|
||||
|
||||
|
||||
def _terminate_executor_processes(executor):
|
||||
|
||||
@@ -1 +1 @@
|
||||
__version__ = "2.7.6"
|
||||
__version__ = "3.0.3"
|
||||
|
||||
Reference in New Issue
Block a user