Merge pull request #4707 from opendatalab/master

master->dev
Update version.py with new version
2026-04-01 05:28:36 +07:00 · 2026-04-01 03:45:01 +08:00 · 2026-03-31 19:34:02 +00:00 · 2026-04-01 03:23:25 +08:00 · 2026-04-01 03:21:10 +08:00 · 2026-04-01 03:12:46 +08:00
5 changed files with 139 additions and 99 deletions
--- a/mineru/cli/api_client.py
+++ b/mineru/cli/api_client.py
@@ -110,6 +110,10 @@ class LocalAPIServer:
        self.process: subprocess.Popen[bytes] | None = None
        self._atexit_registered = False
        self.extra_cli_args = tuple(extra_cli_args)
+        # On Windows, the temporary FastAPI child process can stall during parsing
+        # startup when launched with stdin=PIPE and an EOF-based shutdown watcher.
+        # Use explicit process termination there instead of stdin-driven shutdown.
+        self._use_stdin_shutdown_watcher = os.name != "nt"

    def start(self) -> str:
        if self.process is not None:
@@ -124,7 +128,12 @@ class LocalAPIServer:
            read_max_concurrent_requests(default=DEFAULT_MAX_CONCURRENT_REQUESTS)
        )
        env["MINERU_API_DISABLE_ACCESS_LOG"] = "1"
-        env["MINERU_API_SHUTDOWN_ON_STDIN_EOF"] = "1"
+        if self._use_stdin_shutdown_watcher:
+            env["MINERU_API_SHUTDOWN_ON_STDIN_EOF"] = "1"
+            stdin_target = subprocess.PIPE
+        else:
+            env.pop("MINERU_API_SHUTDOWN_ON_STDIN_EOF", None)
+            stdin_target = subprocess.DEVNULL
        self.output_root.mkdir(parents=True, exist_ok=True)

        command = [
@@ -141,7 +150,7 @@ class LocalAPIServer:
            command,
            cwd=os.getcwd(),
            env=env,
-            stdin=subprocess.PIPE,
+            stdin=stdin_target,
        )

        if not self._atexit_registered:
@@ -154,23 +163,30 @@ class LocalAPIServer:
        self.process = None
        try:
            if process is not None and process.poll() is None:
-                if process.stdin is not None and not process.stdin.closed:
-                    process.stdin.close()
-                try:
-                    process.wait(timeout=LOCAL_API_SHUTDOWN_TIMEOUT_SECONDS)
-                except subprocess.TimeoutExpired:
-                    logger.debug(
-                        "Local mineru-api did not stop after stdin EOF within {}s. Falling back to SIGTERM.",
-                        LOCAL_API_SHUTDOWN_TIMEOUT_SECONDS,
-                    )
-                    process.terminate()
+                if self._use_stdin_shutdown_watcher:
+                    if process.stdin is not None and not process.stdin.closed:
+                        process.stdin.close()
                    try:
                        process.wait(timeout=LOCAL_API_SHUTDOWN_TIMEOUT_SECONDS)
                        return
                    except subprocess.TimeoutExpired:
-                        pass
-                    process.kill()
+                        logger.debug(
+                            "Local mineru-api did not stop after stdin EOF within {}s. Falling back to SIGTERM.",
+                            LOCAL_API_SHUTDOWN_TIMEOUT_SECONDS,
+                        )
+                else:
+                    logger.debug(
+                        "Stopping local mineru-api with process termination on Windows."
+                    )
+
+                process.terminate()
+                try:
                    process.wait(timeout=LOCAL_API_SHUTDOWN_TIMEOUT_SECONDS)
+                    return
+                except subprocess.TimeoutExpired:
+                    pass
+                process.kill()
+                process.wait(timeout=LOCAL_API_SHUTDOWN_TIMEOUT_SECONDS)
        finally:
            if self._atexit_registered:
                try:
--- a/mineru/cli/fast_api.py
+++ b/mineru/cli/fast_api.py
@@ -12,7 +12,7 @@ from contextlib import asynccontextmanager, suppress
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import Any, Optional
+from typing import Annotated, Any, Optional

 import click
 import uvicorn
@@ -81,6 +81,11 @@ FILE_PARSE_TASK_ID_HEADER = "X-MinerU-Task-Id"
 FILE_PARSE_TASK_STATUS_HEADER = "X-MinerU-Task-Status"
 FILE_PARSE_TASK_STATUS_URL_HEADER = "X-MinerU-Task-Status-Url"
 FILE_PARSE_TASK_RESULT_URL_HEADER = "X-MinerU-Task-Result-Url"
+SWAGGER_UI_FILE_ARRAY_SCHEMA_EXTRA = {
+    # Swagger UI 5 currently fails to render a usable multi-file picker when
+    # FastAPI emits OpenAPI 3.1 byte arrays with contentMediaType.
+    "items": {"type": "string", "format": "binary"}
+}

 # 并发控制器
 _request_semaphore: Optional[asyncio.Semaphore] = None
@@ -609,6 +614,7 @@ def build_result_response(
    return_images: bool,
    response_format_zip: bool,
    return_original_file: bool,
+    zip_filename: str = "results.zip",
 ) -> Response:
    if response_format_zip:
        zip_path = create_result_zip(
@@ -627,7 +633,7 @@ def build_result_response(
        return FileResponse(
            path=zip_path,
            media_type="application/zip",
-            filename="results.zip",
+            filename=zip_filename,
            status_code=status_code,
        )

@@ -683,6 +689,7 @@ def build_sync_file_parse_response(
            return_images=task.return_images,
            response_format_zip=task.response_format_zip,
            return_original_file=task.return_original_file,
+            zip_filename=f"{task.task_id}.zip",
        )
        response.headers[FILE_PARSE_TASK_ID_HEADER] = task.task_id
        response.headers[FILE_PARSE_TASK_STATUS_HEADER] = task.status
@@ -713,12 +720,17 @@ def build_sync_file_parse_response(


 async def parse_request_form(
-    files: list[UploadFile] = File(
-        ..., description="Upload pdf or image files for parsing"
-    ),
-    lang_list: list[str] = Form(
-        ["ch"],
-        description="""(Adapted only for pipeline and hybrid backend)Input the languages in the pdf to improve OCR accuracy.Options:
+    files: Annotated[
+        list[UploadFile],
+        File(
+            description="Upload pdf or image files for parsing",
+            json_schema_extra=SWAGGER_UI_FILE_ARRAY_SCHEMA_EXTRA,
+        ),
+    ],
+    lang_list: Annotated[
+        list[str],
+        Form(
+            description="""(Adapted only for pipeline and hybrid backend)Input the languages in the pdf to improve OCR accuracy.Options:
 - ch: Chinese, English, Chinese Traditional.
 - ch_lite: Chinese, English, Chinese Traditional, Japanese.
 - ch_server: Chinese, English, Chinese Traditional, Japanese.
@@ -737,59 +749,84 @@ async def parse_request_form(
 - cyrillic: Russian, Belarusian, Ukrainian, Serbian (Cyrillic), Bulgarian, Mongolian, Abkhazian, Adyghe, Kabardian, Avar, Dargin, Ingush, Chechen, Lak, Lezgin, Tabasaran, Kazakh, Kyrgyz, Tajik, Macedonian, Tatar, Chuvash, Bashkir, Malian, Moldovan, Udmurt, Komi, Ossetian, Buryat, Kalmyk, Tuvan, Sakha, Karakalpak, English.
 - devanagari: Hindi, Marathi, Nepali, Bihari, Maithili, Angika, Bhojpuri, Magahi, Santali, Newari, Konkani, Sanskrit, Haryanvi, English.
 """,
-    ),
-    backend: str = Form(
-        "hybrid-auto-engine",
-        description="""The backend for parsing:
+        ),
+    ] = ["ch"],
+    backend: Annotated[
+        str,
+        Form(
+            description="""The backend for parsing:
 - pipeline: More general, supports multiple languages, hallucination-free.
 - vlm-auto-engine: High accuracy via local computing power, supports Chinese and English documents only.
 - vlm-http-client: High accuracy via remote computing power(client suitable for openai-compatible servers), supports Chinese and English documents only.
 - hybrid-auto-engine: Next-generation high accuracy solution via local computing power, supports multiple languages.
 - hybrid-http-client: High accuracy via remote computing power but requires a little local computing power(client suitable for openai-compatible servers), supports multiple languages.""",
-    ),
-    parse_method: str = Form(
-        "auto",
-        description="""(Adapted only for pipeline and hybrid backend)The method for parsing PDF:
+        ),
+    ] = "hybrid-auto-engine",
+    parse_method: Annotated[
+        str,
+        Form(
+            description="""(Adapted only for pipeline and hybrid backend)The method for parsing PDF:
 - auto: Automatically determine the method based on the file type
 - txt: Use text extraction method
 - ocr: Use OCR method for image-based PDFs
 """,
-    ),
-    formula_enable: bool = Form(True, description="Enable formula parsing."),
-    table_enable: bool = Form(True, description="Enable table parsing."),
-    server_url: Optional[str] = Form(
-        None,
-        description="(Adapted only for <vlm/hybrid>-http-client backend)openai compatible server url, e.g., http://127.0.0.1:30000",
-    ),
-    return_md: bool = Form(True, description="Return markdown content in response"),
-    return_middle_json: bool = Form(
-        False, description="Return middle JSON in response"
-    ),
-    return_model_output: bool = Form(
-        False, description="Return model output JSON in response"
-    ),
-    return_content_list: bool = Form(
-        False, description="Return content list JSON in response"
-    ),
-    return_images: bool = Form(
-        False, description="Return extracted images in response"
-    ),
-    response_format_zip: bool = Form(
-        False, description="Return results as a ZIP file instead of JSON"
-    ),
-    return_original_file: bool = Form(
-        False,
-        description=(
-            "Include the processed original input file in the ZIP result; "
-            "ignored unless response_format_zip=true"
        ),
-    ),
-    start_page_id: int = Form(
-        0, description="The starting page for PDF parsing, beginning from 0"
-    ),
-    end_page_id: int = Form(
-        99999, description="The ending page for PDF parsing, beginning from 0"
-    ),
+    ] = "auto",
+    formula_enable: Annotated[
+        bool,
+        Form(description="Enable formula parsing."),
+    ] = True,
+    table_enable: Annotated[
+        bool,
+        Form(description="Enable table parsing."),
+    ] = True,
+    server_url: Annotated[
+        Optional[str],
+        Form(
+            description="(Adapted only for <vlm/hybrid>-http-client backend)openai compatible server url, e.g., http://127.0.0.1:30000",
+        ),
+    ] = None,
+    return_md: Annotated[
+        bool,
+        Form(description="Return markdown content in response"),
+    ] = True,
+    return_middle_json: Annotated[
+        bool,
+        Form(description="Return middle JSON in response"),
+    ] = False,
+    return_model_output: Annotated[
+        bool,
+        Form(description="Return model output JSON in response"),
+    ] = False,
+    return_content_list: Annotated[
+        bool,
+        Form(description="Return content list JSON in response"),
+    ] = False,
+    return_images: Annotated[
+        bool,
+        Form(description="Return extracted images in response"),
+    ] = False,
+    response_format_zip: Annotated[
+        bool,
+        Form(description="Return results as a ZIP file instead of JSON"),
+    ] = False,
+    return_original_file: Annotated[
+        bool,
+        Form(
+            description=(
+                "Include the processed original input file in the ZIP result; "
+                "ignored unless response_format_zip=true"
+            ),
+        ),
+    ] = False,
+    start_page_id: Annotated[
+        int,
+        Form(description="The starting page for PDF parsing, beginning from 0"),
+    ] = 0,
+    end_page_id: Annotated[
+        int,
+        Form(description="The ending page for PDF parsing, beginning from 0"),
+    ] = 99999,
 ) -> ParseRequestOptions:
    effective_return_original_file = return_original_file and response_format_zip
    return ParseRequestOptions(
@@ -1295,7 +1332,9 @@ def get_task_manager() -> AsyncTaskManager:
 async def parse_pdf(
    http_request: Request,
    background_tasks: BackgroundTasks,
-    request_options: ParseRequestOptions = Depends(parse_request_form),
+    request_options: Annotated[
+        ParseRequestOptions, Depends(parse_request_form)
+    ],
 ):
    task = await create_async_parse_task(request_options)
    request_options = None
@@ -1340,7 +1379,9 @@ async def parse_pdf(
 )
 async def submit_parse_task(
    http_request: Request,
-    request_options: ParseRequestOptions = Depends(parse_request_form),
+    request_options: Annotated[
+        ParseRequestOptions, Depends(parse_request_form)
+    ],
 ):
    task_manager = get_task_manager()
    task = await create_async_parse_task(request_options)
@@ -1399,6 +1440,7 @@ async def get_async_task_result(
        return_images=task.return_images,
        response_format_zip=task.response_format_zip,
        return_original_file=task.return_original_file,
+        zip_filename=f"{task.task_id}.zip",
    )


--- a/mineru/model/utils/pytorchocr/modeling/backbones/rec_pphgnetv2.py
+++ b/mineru/model/utils/pytorchocr/modeling/backbones/rec_pphgnetv2.py
@@ -959,23 +959,6 @@ class LightConvBNAct(TheseusLayer):
        return x


-class PaddingSameAsPaddleMaxPool2d(torch.nn.Module):
-    def __init__(self, kernel_size, stride=1):
-        super().__init__()
-        self.kernel_size = kernel_size
-        self.stride = stride
-        self.pool = torch.nn.MaxPool2d(kernel_size, stride, padding=0, ceil_mode=True)
-
-    def forward(self, x):
-        _, _, h, w = x.shape
-        pad_h_total = max(0, (math.ceil(h / self.stride) - 1) * self.stride + self.kernel_size - h)
-        pad_w_total = max(0, (math.ceil(w / self.stride) - 1) * self.stride + self.kernel_size - w)
-        pad_h = pad_h_total // 2
-        pad_w = pad_w_total // 2
-        x = torch.nn.functional.pad(x, [pad_w, pad_w_total - pad_w, pad_h, pad_h_total - pad_h])
-        return self.pool(x)
-
-
 class StemBlock(TheseusLayer):
    """
    StemBlock for PP-HGNetV2.
@@ -1011,7 +994,6 @@ class StemBlock(TheseusLayer):
            out_channels=mid_channels // 2,
            kernel_size=2,
            stride=1,
-            padding="same",
            use_lab=use_lab,
            lr_mult=lr_mult,
        )
@@ -1020,7 +1002,6 @@ class StemBlock(TheseusLayer):
            out_channels=mid_channels,
            kernel_size=2,
            stride=1,
-            padding="same",
            use_lab=use_lab,
            lr_mult=lr_mult,
        )
@@ -1040,20 +1021,20 @@ class StemBlock(TheseusLayer):
            use_lab=use_lab,
            lr_mult=lr_mult,
        )
-        self.pool = PaddingSameAsPaddleMaxPool2d(
-            kernel_size=2, stride=1,
-        )
+        self.pool = torch.nn.MaxPool2d(kernel_size=2, stride=1, ceil_mode=True)

    def forward(self, x):
-        x = self.stem1(x)
-        x2 = self.stem2a(x)
-        x2 = self.stem2b(x2)
-        x1 = self.pool(x)
-        x = torch.cat([x1, x2], 1)
-        x = self.stem3(x)
-        x = self.stem4(x)
+        embedding = self.stem1(x)
+        embedding = F.pad(embedding, (0, 1, 0, 1))
+        emb_stem_2a = self.stem2a(embedding)
+        emb_stem_2a = F.pad(emb_stem_2a, (0, 1, 0, 1))
+        emb_stem_2a = self.stem2b(emb_stem_2a)
+        pooled_emb = self.pool(embedding)
+        embedding = torch.cat([pooled_emb, emb_stem_2a], 1)
+        embedding = self.stem3(embedding)
+        embedding = self.stem4(embedding)

-        return x
+        return embedding


 class HGV2_Block(TheseusLayer):
--- a/mineru/version.py
+++ b/mineru/version.py
@@ -1 +1 @@
-__version__ = "3.0.3"
+__version__ = "3.0.5"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -87,6 +87,7 @@ pipeline = [
    "torchvision",
    "transformers>=4.57.3,<5.0.0",
    "onnxruntime>1.17.0",
+    "albumentations>=2.0.8,<3",
 ]
 gradio = [
    "gradio>=5.49.1,!=6.0.0,!=6.0.1,!=6.0.2,!=6.1.0,!=6.2.0,!=6.3.0,!=6.4.0,!=6.5.0,!=6.5.1,!=6.6.0,!=6.7.0,<6.9.0",
@@ -101,7 +102,7 @@ all = [
    "mineru[core]",
    "mineru[mlx] ; sys_platform == 'darwin'",
    "mineru[vllm] ; sys_platform == 'linux'",
-    "mineru[lmdeploy] ; sys_platform == 'windows'",
+    "mineru[lmdeploy] ; sys_platform == 'win32'",
 ]

 [project.urls]
Author	SHA1	Message	Date
Xiaomeng Zhao	887758e99d	Merge pull request #4707 from opendatalab/master master->dev	2026-04-01 03:45:01 +08:00
myhloli	31f368ab85	Update version.py with new version	2026-03-31 19:34:02 +00:00
Xiaomeng Zhao	2c65149062	Merge pull request #4706 from opendatalab/dev 3.0.5	2026-04-01 03:23:25 +08:00
Xiaomeng Zhao	e976ca2af0	Merge pull request #4705 from myhloli/dev feat: allow custom zip filename for response in FastAPI file handling	2026-04-01 03:21:10 +08:00
myhloli	87a14040c0	feat: allow custom zip filename for response in FastAPI file handling	2026-04-01 03:12:46 +08:00
Xiaomeng Zhao	739c6343b6	Merge pull request #4704 from opendatalab/dev 3.0.5	2026-04-01 02:46:04 +08:00
Xiaomeng Zhao	11a9a9465e	Merge pull request #4703 from myhloli/dev Dev	2026-04-01 02:44:53 +08:00
myhloli	b583702df1	fix: improve shutdown handling for FastAPI child process on Windows	2026-04-01 02:28:22 +08:00
myhloli	1ca160fdc2	refactor: replace PaddingSameAsPaddleMaxPool2d with torch's MaxPool2d and improve forward method structure	2026-03-31 23:50:52 +08:00
myhloli	93d5251e18	feat: use Annotated for request form parameters in parse_request_form	2026-03-31 23:19:09 +08:00
myhloli	29f767029e	feat: add custom JSON schema for file upload in Swagger UI	2026-03-31 22:56:55 +08:00
myhloli	39b903f029	fix: update sys_platform identifier for Windows in pyproject.toml	2026-03-31 22:21:33 +08:00
myhloli	3d508abfd1	feat: add albumentations dependency to pyproject.toml	2026-03-31 20:11:24 +08:00
Xiaomeng Zhao	b9485f1014	Merge pull request #4695 from opendatalab/master master->dev	2026-03-31 01:51:59 +08:00
myhloli	5869af336b	Update version.py with new version	2026-03-30 17:49:00 +00:00