Compare commits

...

30 Commits

Author SHA1 Message Date
myhloli
ff2caf3a84 Update version.py with new version 2026-03-30 12:23:38 +00:00
Xiaomeng Zhao
be675a5a97 Merge pull request #4689 from opendatalab/dev
3.0.3
2026-03-30 20:20:43 +08:00
Xiaomeng Zhao
fa5f8d61f9 Merge pull request #4688 from myhloli/dev
feat: enhance PDF rendering with persistent executor and recycling logic
2026-03-30 20:19:51 +08:00
myhloli
a6f250b89c feat: enhance PDF rendering with persistent executor and recycling logic 2026-03-30 20:08:56 +08:00
myhloli
2bda24815a feat: implement custom PDF render executor for improved multiprocessing handling 2026-03-30 19:15:22 +08:00
myhloli
980670a649 Merge remote-tracking branch 'origin/dev' into dev 2026-03-30 18:49:58 +08:00
myhloli
84d1384c2b feat: improve task status polling with timeout handling and logging 2026-03-30 18:49:51 +08:00
Xiaomeng Zhao
de8ea317ba Merge pull request #4687 from opendatalab/master
master->dev
2026-03-30 17:31:41 +08:00
myhloli
d0d2bf920b Update version.py with new version 2026-03-30 09:29:31 +00:00
Xiaomeng Zhao
6a5a2efc50 Merge pull request #4686 from opendatalab/dev
3.0.2
2026-03-30 17:26:41 +08:00
Xiaomeng Zhao
1cfa71be1c Merge pull request #4685 from myhloli/dev
feat: enhance language guessing by normalizing text for surrogate pairs
2026-03-30 17:22:33 +08:00
myhloli
66df0cd367 feat: enhance language guessing by normalizing text for surrogate pairs 2026-03-30 17:21:57 +08:00
Xiaomeng Zhao
9063a4133d Merge pull request #4684 from myhloli/dev
Dev
2026-03-30 15:33:43 +08:00
myhloli
b85d18dca1 feat: update hybrid-http-client description to clarify local computing power requirements 2026-03-30 15:32:17 +08:00
myhloli
016169ce2e feat: add backend dependency checks for hybrid clients and update documentation 2026-03-30 15:26:03 +08:00
myhloli
c635892b75 feat: adjust max concurrent requests based on macOS environment detection 2026-03-30 14:34:59 +08:00
myhloli
d9778814fa feat: simplify parsing logic by removing unnecessary serialization and device checks 2026-03-30 12:06:52 +08:00
myhloli
62054c9528 feat: implement stdin shutdown watcher and enhance local API shutdown handling 2026-03-30 11:29:45 +08:00
myhloli
de77fc3af5 feat: add mineru-router service to compose.yaml and update docker_deployment.md for improved deployment instructions 2026-03-30 10:53:54 +08:00
myhloli
f913df4ec6 feat: remove commented-out data-parallel-size option from compose.yaml for clarity 2026-03-30 10:44:30 +08:00
Xiaomeng Zhao
264c594b23 Merge pull request #4677 from opendatalab/master
master->dev
2026-03-29 13:33:19 +08:00
myhloli
a6b6d3081c Update version.py with new version 2026-03-29 05:28:26 +00:00
Xiaomeng Zhao
1f82300d1d Merge pull request #4676 from opendatalab/dev
Dev
2026-03-29 13:24:20 +08:00
Xiaomeng Zhao
2d4fa2cc8e Merge pull request #4675 from myhloli/dev
feat: refactor OCR processing to improve span handling and reduce cod…
2026-03-29 13:23:04 +08:00
myhloli
71b9e9f780 feat: refactor OCR processing to improve span handling and reduce code duplication 2026-03-29 13:21:13 +08:00
Xiaomeng Zhao
5e51ab2934 Merge pull request #4674 from opendatalab/master
master->dev
2026-03-29 04:18:37 +08:00
Xiaomeng Zhao
520c61faaf Merge pull request #4673 from myhloli/dev
Dev
2026-03-29 04:17:57 +08:00
myhloli
72601b314a feat: update minimum hardware requirements in index.md for clarity and accuracy 2026-03-29 04:14:24 +08:00
myhloli
e54c67dcd1 feat: update minimum hardware requirements in index.md for clarity and accuracy 2026-03-29 04:13:48 +08:00
myhloli
33e4fbd694 Update version.py with new version 2026-03-28 20:11:08 +00:00
16 changed files with 375 additions and 91 deletions

View File

@@ -12,8 +12,7 @@ services:
command:
--host 0.0.0.0
--port 30000
# --data-parallel-size 2 # If using multiple GPUs, increase throughput using vllm's multi-GPU parallel mode
# --gpu-memory-utilization 0.5 # If running on a single GPU and encountering VRAM shortage, reduce the KV cache size by this parameter, if VRAM issues persist, try lowering it further to `0.4` or below.
# --gpu-memory-utilization 0.5 # If encountering VRAM shortage, reduce the KV cache size by this parameter; if VRAM issues persist, try lowering it further to `0.4` or below.
ulimits:
memlock: -1
stack: 67108864
@@ -42,8 +41,7 @@ services:
--host 0.0.0.0
--port 8000
# parameters for vllm-engine
# --data-parallel-size 2 # If using multiple GPUs, increase throughput using vllm's multi-GPU parallel mode
# --gpu-memory-utilization 0.5 # If running on a single GPU and encountering VRAM shortage, reduce the KV cache size by this parameter, if VRAM issues persist, try lowering it further to `0.4` or below.
# --gpu-memory-utilization 0.5 # If encountering VRAM shortage, reduce the KV cache size by this parameter; if VRAM issues persist, try lowering it further to `0.4` or below.
ulimits:
memlock: -1
stack: 67108864
@@ -58,6 +56,40 @@ services:
device_ids: ["0"] # Modify for multiple GPUs: ["0", "1"]
capabilities: [gpu]
mineru-router:
image: mineru:latest
container_name: mineru-router
restart: always
profiles: ["router"]
ports:
- 8002:8002
environment:
MINERU_MODEL_SOURCE: local
entrypoint: mineru-router
command:
--host 0.0.0.0
--port 8002
--local-gpus auto
# To aggregate existing mineru-api services instead of starting local workers:
# --local-gpus none
# --upstream-url http://mineru-api:8000
# --upstream-url http://mineru-api-2:8000
# parameters for vllm-engine
# --gpu-memory-utilization 0.5 # If encountering VRAM shortage, reduce the KV cache size by this parameter; if VRAM issues persist, try lowering it further to `0.4` or below.
ulimits:
memlock: -1
stack: 67108864
ipc: host
healthcheck:
test: ["CMD-SHELL", "curl -f http://localhost:8002/health || exit 1"]
deploy:
resources:
reservations:
devices:
- driver: nvidia
device_ids: ["0"] # Modify for multiple GPUs: ["0", "1"]
capabilities: [gpu]
mineru-gradio:
image: mineru:latest
container_name: mineru-gradio
@@ -74,8 +106,7 @@ services:
# --enable-api false # If you want to disable the API, set this to false
# --max-convert-pages 20 # If you want to limit the number of pages for conversion, set this to a specific number
# parameters for vllm-engine
# --data-parallel-size 2 # If using multiple GPUs, increase throughput using vllm's multi-GPU parallel mode
# --gpu-memory-utilization 0.5 # If running on a single GPU and encountering VRAM shortage, reduce the KV cache size by this parameter, if VRAM issues persist, try lowering it further to `0.4` or below.
# --gpu-memory-utilization 0.5 # If encountering VRAM shortage, reduce the KV cache size by this parameter; if VRAM issues persist, try lowering it further to `0.4` or below.
ulimits:
memlock: -1
stack: 67108864

View File

@@ -25,7 +25,7 @@ MinerU's Docker uses `vllm/vllm-openai` as the base image, so it includes the `v
```bash
docker run --gpus all \
--shm-size 32g \
-p 30000:30000 -p 7860:7860 -p 8000:8000 \
-p 30000:30000 -p 7860:7860 -p 8000:8000 -p 8002:8002 \
--ipc=host \
-it mineru:latest \
/bin/bash
@@ -73,6 +73,17 @@ connect to `openai-server` via `vlm-http-client` backend
---
### Start MinerU Router service
```bash
docker compose -f compose.yaml --profile router up -d
```
>[!TIP]
>
>- The default configuration runs in `--local-gpus auto` mode, automatically starting local workers in the container and exposing the unified entry at `http://<server_ip>:8002/docs`.
>- If you want to aggregate existing `mineru-api` services instead of starting local workers, refer to the commented example under the `mineru-router` service in `compose.yaml` and switch to `--upstream-url`.
---
### Start Gradio WebUI service
```bash
docker compose -f compose.yaml --profile gradio up -d

View File

@@ -50,7 +50,7 @@ A WebUI developed based on Gradio, with a simple interface and only core parsing
</tr>
<tr>
<th>Accuracy<sup>1</sup></th>
<td style="text-align:center;">82+</td>
<td style="text-align:center;">86+</td>
<td colspan="4" style="text-align:center;">90+</td>
</tr>
<tr>
@@ -70,15 +70,15 @@ A WebUI developed based on Gradio, with a simple interface and only core parsing
</tr>
<tr>
<th>Min VRAM</th>
<td style="text-align:center;">6GB</td>
<td style="text-align:center;">10GB</td>
<td style="text-align:center;">4GB</td>
<td style="text-align:center;">8GB</td>
<td style="text-align:center;">3GB</td>
<td style="text-align:center;">8GB</td>
<td style="text-align:center;">2GB</td>
</tr>
<tr>
<th>RAM</th>
<td colspan="3" style="text-align:center;">Min 16GB+, Recommended 32GB+</td>
<td colspan="2" style="text-align:center;">8GB</td>
<td colspan="2" style="text-align:center;">16GB</td>
</tr>
<tr>
<th>Disk Space</th>

View File

@@ -104,6 +104,8 @@ If you need to adjust parsing options through custom parameters, you can also ch
> ```bash
> mineru -p <input_path> -o <output_path> -b hybrid-http-client -u http://127.0.0.1:30000
> ```
>`vlm-http-client` is the lightweight remote client option and does not require local `torch`.
>`hybrid-http-client` requires local pipeline dependencies such as `mineru[pipeline]` and `torch`.
> [!NOTE]
> All officially supported `vllm/lmdeploy` parameters can be passed to MinerU through command line arguments, including the following commands: `mineru`, `mineru-openai-server`, `mineru-gradio`, `mineru-api`, `mineru-router`.

View File

@@ -25,7 +25,7 @@ Mineru的docker使用了`vllm/vllm-openai`作为基础镜像因此在docker
```bash
docker run --gpus all \
--shm-size 32g \
-p 30000:30000 -p 7860:7860 -p 8000:8000 \
-p 30000:30000 -p 7860:7860 -p 8000:8000 -p 8002:8002 \
--ipc=host \
-it mineru:latest \
/bin/bash
@@ -72,10 +72,21 @@ wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/compose.yaml
---
### 启动 MinerU Router 服务
```bash
docker compose -f compose.yaml --profile router up -d
```
>[!TIP]
>
>- 默认配置会以 `--local-gpus auto` 模式在容器内自动拉起本地 worker,并通过 `http://<server_ip>:8002/docs` 暴露统一入口。
>- 如果您希望聚合已有的 `mineru-api` 服务而不是启动本地 worker,可直接参考 `compose.yaml` 中 `mineru-router` 服务下的注释示例,改为使用 `--upstream-url`。
---
### 启动 Gradio WebUI 服务
```bash
docker compose -f compose.yaml --profile gradio up -d
```
>[!TIP]
>
>- 在浏览器中访问 `http://<server_ip>:7860` 使用 Gradio WebUI。

View File

@@ -50,7 +50,7 @@
</tr>
<tr>
<th>精度指标<sup>1</sup></th>
<td style="text-align:center;">82+</td>
<td style="text-align:center;">86+</td>
<td colspan="4" style="text-align:center;">90+</td>
</tr>
<tr>
@@ -70,15 +70,15 @@
</tr>
<tr>
<th>显存最低要求</th>
<td style="text-align:center;">6GB</td>
<td style="text-align:center;">10GB</td>
<td style="text-align:center;">4GB</td>
<td style="text-align:center;">8GB</td>
<td style="text-align:center;">3GB</td>
<td style="text-align:center;">8GB</td>
<td style="text-align:center;">2GB</td>
</tr>
<tr>
<th>内存要求</th>
<td colspan="3" style="text-align:center;">最低16GB以上,推荐32GB以上</td>
<td colspan="2" style="text-align:center;">8GB</td>
<td colspan="2" style="text-align:center;">16GB</td>
</tr>
<tr>
<th>磁盘空间要求</th>

View File

@@ -104,6 +104,8 @@ mineru -p <input_path> -o <output_path>
> ```bash
> mineru -p <input_path> -o <output_path> -b hybrid-http-client -u http://127.0.0.1:30000
> ```
>`vlm-http-client` 是轻量远程 client用法上不要求本地安装 `torch`。
>`hybrid-http-client` 需要本地具备 `mineru[pipeline]` 及 `torch` 等 pipeline 依赖。
> [!NOTE]
> 所有`vllm/lmdeploy`官方支持的参数都可以通过命令行参数传递给 MinerU,包括以下命令:`mineru`、`mineru-openai-server`、`mineru-gradio`、`mineru-api`、`mineru-router`

View File

@@ -131,23 +131,27 @@ def blocks_to_page_info(
return page_info
def _iter_block_spans(block):
for line in block.get("lines", []):
for span in line.get("spans", []):
yield span
for sub_block in block.get("blocks", []):
yield from _iter_block_spans(sub_block)
def _apply_post_ocr(pdf_info_list, hybrid_pipeline_model):
need_ocr_list = []
img_crop_list = []
text_block_list = []
for page_info in pdf_info_list:
for block in page_info['para_blocks']:
if block['type'] in ['table', 'image', 'list', 'code']:
for sub_block in block['blocks']:
if not sub_block['type'].endswith('body'):
text_block_list.append(sub_block)
elif block['type'] in ['text', 'title', 'ref_text']:
text_block_list.append(block)
for block in page_info['discarded_blocks']:
text_block_list.append(block)
for block in text_block_list:
for line in block['lines']:
for span in line['spans']:
for block in page_info.get('para_blocks', []):
for span in _iter_block_spans(block):
if 'np_img' in span:
need_ocr_list.append(span)
img_crop_list.append(rotate_vertical_crop_if_needed(span['np_img']))
span.pop('np_img')
for block in page_info.get('discarded_blocks', []):
for span in _iter_block_spans(block):
if 'np_img' in span:
need_ocr_list.append(span)
img_crop_list.append(rotate_vertical_crop_if_needed(span['np_img']))

View File

@@ -203,6 +203,15 @@ def _extract_text_from_block(block):
return "".join(text_parts).strip()
def _iter_block_spans(block):
for line in block.get("lines", []):
for span in line.get("spans", []):
yield span
for sub_block in block.get("blocks", []):
yield from _iter_block_spans(sub_block)
def _normalize_formula_tag_content(tag_content):
tag_content = full_to_half(tag_content.strip())
if tag_content.startswith("("):
@@ -261,22 +270,18 @@ def _optimize_formula_number_blocks(pdf_info_list):
def _apply_post_ocr(pdf_info_list, lang=None):
need_ocr_list = []
img_crop_list = []
text_block_list = []
for page_info in pdf_info_list:
for block in page_info['preproc_blocks']:
if 'blocks' in block:
for sub_block in block['blocks']:
if sub_block.get("type", "").endswith('caption') or sub_block.get("type", "").endswith('footnote'):
text_block_list.append(sub_block)
elif block["type"] not in [BlockType.INTERLINE_EQUATION, BlockType.SEAL]:
text_block_list.append(block)
for block in page_info['discarded_blocks']:
text_block_list.append(block)
for block in page_info.get('preproc_blocks', []):
for span in _iter_block_spans(block):
if 'np_img' in span:
need_ocr_list.append(span)
# Keep post-OCR rec aligned with the main OCR pipeline for vertical tall crops.
img_crop_list.append(rotate_vertical_crop_if_needed(span['np_img']))
span.pop('np_img')
for block in text_block_list:
for line in block['lines']:
for span in line['spans']:
for block in page_info.get('discarded_blocks', []):
for span in _iter_block_spans(block):
if 'np_img' in span:
need_ocr_list.append(span)
# Keep post-OCR rec aligned with the main OCR pipeline for vertical tall crops.

View File

@@ -32,6 +32,7 @@ TASKS_ENDPOINT = "/tasks"
TASK_STATUS_POLL_INTERVAL_SECONDS = 1.0
TASK_RESULT_TIMEOUT_SECONDS = 3600
LOCAL_API_STARTUP_TIMEOUT_SECONDS = 30
LOCAL_API_SHUTDOWN_TIMEOUT_SECONDS = 10
LOCAL_API_CLEANUP_RETRIES = 8
LOCAL_API_CLEANUP_RETRY_INTERVAL_SECONDS = 0.25
@@ -87,6 +88,7 @@ class LocalAPIServer:
read_max_concurrent_requests(default=DEFAULT_MAX_CONCURRENT_REQUESTS)
)
env["MINERU_API_DISABLE_ACCESS_LOG"] = "1"
env["MINERU_API_SHUTDOWN_ON_STDIN_EOF"] = "1"
self.output_root.mkdir(parents=True, exist_ok=True)
command = [
@@ -103,6 +105,7 @@ class LocalAPIServer:
command,
cwd=os.getcwd(),
env=env,
stdin=subprocess.PIPE,
)
if not self._atexit_registered:
@@ -115,12 +118,23 @@ class LocalAPIServer:
self.process = None
try:
if process is not None and process.poll() is None:
process.terminate()
if process.stdin is not None and not process.stdin.closed:
process.stdin.close()
try:
process.wait(timeout=5)
process.wait(timeout=LOCAL_API_SHUTDOWN_TIMEOUT_SECONDS)
except subprocess.TimeoutExpired:
logger.debug(
"Local mineru-api did not stop after stdin EOF within {}s. Falling back to SIGTERM.",
LOCAL_API_SHUTDOWN_TIMEOUT_SECONDS,
)
process.terminate()
try:
process.wait(timeout=LOCAL_API_SHUTDOWN_TIMEOUT_SECONDS)
return
except subprocess.TimeoutExpired:
pass
process.kill()
process.wait(timeout=5)
process.wait(timeout=LOCAL_API_SHUTDOWN_TIMEOUT_SECONDS)
finally:
if self._atexit_registered:
try:
@@ -475,7 +489,17 @@ async def wait_for_task_result(
) -> None:
deadline = asyncio.get_running_loop().time() + timeout_seconds
while asyncio.get_running_loop().time() < deadline:
response = await client.get(submit_response.status_url)
try:
response = await client.get(submit_response.status_url)
except httpx.ReadTimeout:
logger.warning(
"Timed out while polling task status for {} (task_id={}). "
"This can happen during cold start; retrying until the task deadline.",
task_label,
submit_response.task_id,
)
await asyncio.sleep(TASK_STATUS_POLL_INTERVAL_SECONDS)
continue
if response.status_code != 200:
raise click.ClickException(
f"Failed to query task status for {task_label}: "

View File

@@ -30,6 +30,8 @@ from mineru.utils.pdfium_guard import (
from mineru.version import __version__
from mineru.cli.common import (
HybridDependencyError,
ensure_backend_dependencies,
image_suffixes,
office_suffixes,
pdf_suffixes,
@@ -847,6 +849,11 @@ async def run_orchestrated_cli(
raise click.ClickException("--start must be greater than or equal to 0")
if end_page_id is not None and end_page_id < 0:
raise click.ClickException("--end must be greater than or equal to 0")
if api_url is None:
try:
ensure_backend_dependencies(backend)
except HybridDependencyError as exc:
raise click.ClickException(str(exc)) from exc
output_dir.mkdir(parents=True, exist_ok=True)
documents = collect_input_documents(

View File

@@ -1,4 +1,6 @@
# Copyright (c) Opendatalab. All rights reserved.
import importlib
import importlib.util
import json
import os
from concurrent.futures import ThreadPoolExecutor
@@ -40,6 +42,37 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
MAX_TASK_STEM_BYTES = 200
class HybridDependencyError(RuntimeError):
pass
def build_hybrid_dependency_error_message(backend: str) -> str:
return (
f"`{backend}` requires local pipeline dependencies (`mineru[pipeline]`, "
"including `torch`). Install `mineru[pipeline]` or `mineru[core]`. "
"If you need a lightweight remote client without local `torch`, "
"use `vlm-http-client` instead."
)
def ensure_backend_dependencies(backend: str) -> None:
if not backend.startswith("hybrid-"):
return
if importlib.util.find_spec("torch") is None:
raise HybridDependencyError(build_hybrid_dependency_error_message(backend))
def _load_hybrid_analyze_entrypoint(entrypoint_name: str, backend: str):
ensure_backend_dependencies(backend)
try:
hybrid_analyze = importlib.import_module("mineru.backend.hybrid.hybrid_analyze")
except (ImportError, ModuleNotFoundError) as exc:
raise HybridDependencyError(
build_hybrid_dependency_error_message(backend)
) from exc
return getattr(hybrid_analyze, entrypoint_name)
def utf8_byte_length(value: str) -> int:
return len(value.encode("utf-8"))
@@ -438,7 +471,10 @@ def _process_hybrid(
server_url=None,
**kwargs,
):
from mineru.backend.hybrid.hybrid_analyze import doc_analyze as hybrid_doc_analyze
hybrid_doc_analyze = _load_hybrid_analyze_entrypoint(
"doc_analyze",
f"hybrid-{backend}",
)
"""同步处理hybrid后端逻辑"""
if not backend.endswith("client"):
server_url = None
@@ -491,7 +527,10 @@ async def _async_process_hybrid(
server_url=None,
**kwargs,
):
from mineru.backend.hybrid.hybrid_analyze import aio_doc_analyze as aio_hybrid_doc_analyze
aio_hybrid_doc_analyze = _load_hybrid_analyze_entrypoint(
"aio_doc_analyze",
f"hybrid-{backend}",
)
"""异步处理hybrid后端逻辑"""
if not backend.endswith("client"):
server_url = None
@@ -642,6 +681,7 @@ def do_parse(
server_url, **kwargs,
)
elif backend.startswith("hybrid-"):
ensure_backend_dependencies(backend)
backend = backend[7:]
if backend == "vllm-async-engine":
@@ -734,6 +774,7 @@ async def aio_do_parse(
server_url, **kwargs,
)
elif backend.startswith("hybrid-"):
ensure_backend_dependencies(backend)
backend = backend[7:]
if backend == "vllm-engine":

View File

@@ -1,5 +1,6 @@
import asyncio
import mimetypes
import multiprocessing
import os
import shutil
import sys
@@ -49,8 +50,8 @@ from mineru.cli.api_protocol import (
DEFAULT_PROCESSING_WINDOW_SIZE,
)
from mineru.utils.cli_parser import arg_parse
from mineru.utils.check_sys_env import is_mac_environment
from mineru.utils.config_reader import (
get_device,
get_max_concurrent_requests as read_max_concurrent_requests,
get_processing_window_size,
)
@@ -79,8 +80,7 @@ FILE_PARSE_TASK_RESULT_URL_HEADER = "X-MinerU-Task-Result-Url"
# 并发控制器
_request_semaphore: Optional[asyncio.Semaphore] = None
_configured_max_concurrent_requests = 0
_mps_parse_lock = threading.Lock()
_configured_max_concurrent_requests = 1
def env_flag_enabled(name: str, default: bool = False) -> bool:
@@ -90,6 +90,33 @@ def env_flag_enabled(name: str, default: bool = False) -> bool:
return value.lower() in ("1", "true", "yes", "on")
def is_main_multiprocessing_process() -> bool:
try:
return multiprocessing.current_process().name == "MainProcess"
except Exception:
return True
def install_stdin_shutdown_watcher(server: uvicorn.Server) -> None:
if not env_flag_enabled("MINERU_API_SHUTDOWN_ON_STDIN_EOF"):
return
def _watch_stdin_for_eof() -> None:
stdin_stream = getattr(sys.stdin, "buffer", sys.stdin)
try:
stdin_stream.read()
except Exception:
return
server.should_exit = True
watcher = threading.Thread(
target=_watch_stdin_for_eof,
name="mineru-api-stdin-shutdown",
daemon=True,
)
watcher.start()
@dataclass
class ParseRequestOptions:
files: list[UploadFile]
@@ -202,14 +229,19 @@ def create_app():
)
global _request_semaphore, _configured_max_concurrent_requests
max_concurrent_requests = read_max_concurrent_requests(
default=DEFAULT_MAX_CONCURRENT_REQUESTS
)
if is_mac_environment():
max_concurrent_requests = 1
else:
max_concurrent_requests = read_max_concurrent_requests(
default=DEFAULT_MAX_CONCURRENT_REQUESTS
)
_configured_max_concurrent_requests = max_concurrent_requests
app.state.max_concurrent_requests = max_concurrent_requests
_request_semaphore = asyncio.Semaphore(max_concurrent_requests)
logger.info(f"Request concurrency limited to {max_concurrent_requests}")
if is_main_multiprocessing_process():
logger.info(f"Request concurrency limited to {max_concurrent_requests}")
app.add_middleware(GZipMiddleware, minimum_size=1000)
return app
@@ -854,32 +886,12 @@ async def run_parse_job(
)
if request_options.backend == "pipeline":
async with serialize_parse_job_if_needed(request_options.backend):
await asyncio.to_thread(do_parse, **parse_kwargs)
await asyncio.to_thread(do_parse, **parse_kwargs)
else:
async with serialize_parse_job_if_needed(request_options.backend):
await aio_do_parse(**parse_kwargs)
await aio_do_parse(**parse_kwargs)
return response_file_names
def should_serialize_parse_job(backend: str) -> bool:
if get_device() != "mps":
return False
return backend == "pipeline" or backend.startswith(("vlm-", "hybrid-"))
@asynccontextmanager
async def serialize_parse_job_if_needed(backend: str):
if not should_serialize_parse_job(backend):
yield
return
await asyncio.to_thread(_mps_parse_lock.acquire)
try:
yield
finally:
_mps_parse_lock.release()
def create_task_output_dir(task_id: str) -> str:
output_root = get_output_root()
task_output_dir = output_root / task_id
@@ -1422,7 +1434,16 @@ def main(ctx, host, port, reload, **kwargs):
access_log=access_log,
)
else:
uvicorn.run(app, host=host, port=port, reload=False, access_log=access_log)
config = uvicorn.Config(
app,
host=host,
port=port,
reload=False,
access_log=access_log,
)
server = uvicorn.Server(config)
install_stdin_shutdown_watcher(server)
server.run()
if __name__ == "__main__":

View File

@@ -8,9 +8,49 @@ DEFAULT_LANG = "txt"
PDF_SIG_BYTES = b'%PDF'
magika = Magika()
def _normalize_text_for_language_guess(code: str) -> str:
if not code:
return ""
normalized = []
index = 0
while index < len(code):
current_char = code[index]
current_ord = ord(current_char)
if 0xD800 <= current_ord <= 0xDBFF:
if index + 1 < len(code):
next_char = code[index + 1]
next_ord = ord(next_char)
if 0xDC00 <= next_ord <= 0xDFFF:
pair = current_char + next_char
normalized.append(pair.encode("utf-16", "surrogatepass").decode("utf-16"))
index += 2
continue
index += 1
continue
if 0xDC00 <= current_ord <= 0xDFFF:
index += 1
continue
normalized.append(current_char)
index += 1
return "".join(normalized)
def guess_language_by_text(code):
codebytes = code.encode(encoding="utf-8")
lang = magika.identify_bytes(codebytes).prediction.output.label
normalized_code = _normalize_text_for_language_guess(code)
if not normalized_code:
return DEFAULT_LANG
try:
codebytes = normalized_code.encode("utf-8", errors="replace")
lang = magika.identify_bytes(codebytes).prediction.output.label
except Exception:
return DEFAULT_LANG
return lang if lang != "unknown" else DEFAULT_LANG
@@ -32,4 +72,4 @@ def guess_suffix_by_path(file_path) -> str:
suffix = "pdf"
except Exception as e:
logger.warning(f"Failed to read file {file_path} for PDF signature check: {e}")
return suffix
return suffix

View File

@@ -1,6 +1,9 @@
# Copyright (c) Opendatalab. All rights reserved.
import atexit
import multiprocessing
import os
import signal
import threading
import time
from io import BytesIO
@@ -25,6 +28,7 @@ from mineru.utils.pdfium_guard import (
)
from concurrent.futures import ProcessPoolExecutor, wait, ALL_COMPLETED
from concurrent.futures.process import BrokenProcessPool
DEFAULT_PDF_IMAGE_DPI = 200
@@ -32,6 +36,9 @@ DEFAULT_PDF_IMAGE_DPI = 200
MAX_PDF_RENDER_PROCESSES = 4
MIN_PAGES_PER_RENDER_PROCESS = 30
_pdf_render_executor: ProcessPoolExecutor | None = None
_pdf_render_executor_lock = threading.Lock()
def pdf_page_to_image(
page: pdfium.PdfPage,
@@ -113,6 +120,79 @@ def _get_render_process_plan(
)
def _get_pdf_render_pool_capacity(cpu_count=None) -> int:
available_cpus = max(1, cpu_count if cpu_count is not None else (os.cpu_count() or 1))
configured_threads = max(1, get_load_images_threads())
return min(
available_cpus,
configured_threads,
MAX_PDF_RENDER_PROCESSES,
)
def _create_pdf_render_executor(max_workers: int) -> ProcessPoolExecutor:
if is_windows_environment():
return ProcessPoolExecutor(max_workers=max_workers)
start_method = multiprocessing.get_start_method()
if start_method == "fork":
logger.debug(
"PDF image rendering switches multiprocessing start method from fork to spawn"
)
return ProcessPoolExecutor(
max_workers=max_workers,
mp_context=multiprocessing.get_context("spawn"),
)
return ProcessPoolExecutor(max_workers=max_workers)
def _get_pdf_render_executor() -> ProcessPoolExecutor:
global _pdf_render_executor
with _pdf_render_executor_lock:
if _pdf_render_executor is None:
max_workers = _get_pdf_render_pool_capacity()
_pdf_render_executor = _create_pdf_render_executor(max_workers=max_workers)
logger.debug(
f"Created persistent PDF render executor with max_workers={max_workers}"
)
return _pdf_render_executor
def _recycle_pdf_render_executor(
executor: ProcessPoolExecutor | None,
*,
terminate_processes: bool,
) -> None:
global _pdf_render_executor
if executor is None:
return
with _pdf_render_executor_lock:
if _pdf_render_executor is executor:
_pdf_render_executor = None
if terminate_processes:
_terminate_executor_processes(executor)
executor.shutdown(wait=False, cancel_futures=True)
def shutdown_pdf_render_executor() -> None:
global _pdf_render_executor
with _pdf_render_executor_lock:
executor = _pdf_render_executor
_pdf_render_executor = None
if executor is not None:
executor.shutdown(wait=False, cancel_futures=True)
atexit.register(shutdown_pdf_render_executor)
def _load_images_from_pdf_bytes_range(
pdf_bytes: bytes,
dpi=DEFAULT_PDF_IMAGE_DPI,
@@ -141,7 +221,8 @@ def _load_images_from_pdf_bytes_range(
f"{start_page_id + 1}-{end_page_id + 1}: {page_ranges}"
)
executor = ProcessPoolExecutor(max_workers=actual_threads)
executor = _get_pdf_render_executor()
recycle_executor = False
try:
futures = []
future_to_range = {}
@@ -159,7 +240,7 @@ def _load_images_from_pdf_bytes_range(
_, not_done = wait(futures, timeout=timeout, return_when=ALL_COMPLETED)
if not_done:
_terminate_executor_processes(executor)
recycle_executor = True
raise TimeoutError(
f"PDF image rendering timeout after {timeout}s "
f"for pages {start_page_id + 1}-{end_page_id + 1}"
@@ -177,12 +258,16 @@ def _load_images_from_pdf_bytes_range(
images_list.extend(imgs)
return images_list
except Exception as exc:
if not isinstance(exc, TimeoutError):
_terminate_executor_processes(executor)
except BrokenProcessPool:
recycle_executor = True
raise
finally:
executor.shutdown(wait=False, cancel_futures=True)
if recycle_executor:
logger.warning("Recycling persistent PDF render executor after render failure")
_recycle_pdf_render_executor(
executor,
terminate_processes=True,
)
def _terminate_executor_processes(executor):

View File

@@ -1 +1 @@
__version__ = "2.7.6"
__version__ = "3.0.3"