Merge pull request #4665 from myhloli/dev

Dev
This commit is contained in:
Xiaomeng Zhao
2026-03-27 02:01:58 +08:00
committed by GitHub
8 changed files with 104 additions and 57 deletions

View File

@@ -1 +0,0 @@
~~<u>**xyz**</u>~~

View File

@@ -16,6 +16,7 @@ from mineru.backend.hybrid.hybrid_model_output_to_middle_json import (
finalize_middle_json,
init_middle_json,
)
from mineru.backend.utils import exclude_progress_bar_idle_time
from mineru.backend.pipeline.model_init import HybridModelSingleton
from mineru.backend.vlm.vlm_analyze import (
ModelSingleton,
@@ -574,7 +575,9 @@ def doc_analyze(
batch_ratio = get_batch_ratio(device) if not _vlm_ocr_enable else 1
infer_start = time.time()
with tqdm(total=page_count, desc="Processing pages") as progress_bar:
progress_bar = None
last_append_end_time = None
try:
for window_index, window_start in enumerate(range(0, page_count, effective_window_size or 1)):
window_end = min(page_count - 1, window_start + effective_window_size - 1)
images_list = load_images_from_pdf_doc(
@@ -609,6 +612,14 @@ def doc_analyze(
)
model_list.extend(window_model_list)
if progress_bar is None:
progress_bar = tqdm(total=page_count, desc="Processing pages")
else:
exclude_progress_bar_idle_time(
progress_bar,
last_append_end_time,
now=time.time(),
)
append_page_model_list_to_middle_json(
middle_json,
window_model_list,
@@ -620,8 +631,12 @@ def doc_analyze(
_vlm_ocr_enable=_vlm_ocr_enable,
progress_bar=progress_bar,
)
last_append_end_time = time.time()
finally:
_close_images(images_list)
finally:
if progress_bar is not None:
progress_bar.close()
infer_time = round(time.time() - infer_start, 2)
if infer_time > 0 and page_count > 0:
@@ -687,7 +702,9 @@ async def aio_doc_analyze(
batch_ratio = get_batch_ratio(device) if not _vlm_ocr_enable else 1
infer_start = time.time()
with tqdm(total=page_count, desc="Processing pages") as progress_bar:
progress_bar = None
last_append_end_time = None
try:
for window_index, window_start in enumerate(range(0, page_count, effective_window_size or 1)):
window_end = min(page_count - 1, window_start + effective_window_size - 1)
images_list = load_images_from_pdf_doc(
@@ -722,6 +739,14 @@ async def aio_doc_analyze(
)
model_list.extend(window_model_list)
if progress_bar is None:
progress_bar = tqdm(total=page_count, desc="Processing pages")
else:
exclude_progress_bar_idle_time(
progress_bar,
last_append_end_time,
now=time.time(),
)
append_page_model_list_to_middle_json(
middle_json,
window_model_list,
@@ -733,8 +758,12 @@ async def aio_doc_analyze(
_vlm_ocr_enable=_vlm_ocr_enable,
progress_bar=progress_bar,
)
last_append_end_time = time.time()
finally:
_close_images(images_list)
finally:
if progress_bar is not None:
progress_bar.close()
infer_time = round(time.time() - infer_start, 2)
if infer_time > 0 and page_count > 0:

View File

@@ -13,6 +13,7 @@ from .model_json_to_middle_json import (
finalize_middle_json,
init_middle_json,
)
from ..utils import exclude_progress_bar_idle_time
from mineru.utils.config_reader import get_device, get_processing_window_size
from ...utils.enum_class import ImageType
from ...utils.pdf_classify import classify
@@ -186,7 +187,9 @@ def doc_analyze_streaming(
processed_pages = 0
infer_start = time.time()
try:
with tqdm(total=total_pages, desc="Processing pages") as progress_bar:
progress_bar = None
last_append_end_time = None
try:
batch_index = 0
while processed_pages < total_pages:
batch_index += 1
@@ -237,6 +240,14 @@ def doc_analyze_streaming(
formula_enable=formula_enable,
table_enable=table_enable,
)
if progress_bar is None:
progress_bar = tqdm(total=total_pages, desc="Processing pages")
else:
exclude_progress_bar_idle_time(
progress_bar,
last_append_end_time,
now=time.time(),
)
result_offset = 0
for context, images_list, page_start, take_count in batch_payloads:
@@ -259,7 +270,11 @@ def doc_analyze_streaming(
if context['next_page_idx'] >= context['page_count'] and not context['closed']:
_finalize_processing_window_context(context, on_doc_ready)
last_append_end_time = time.time()
processed_pages += len(batch_images)
finally:
if progress_bar is not None:
progress_bar.close()
infer_time = round(time.time() - infer_start, 2)
if infer_time > 0:

View File

@@ -1,4 +1,5 @@
import os
import time
from loguru import logger
@@ -21,4 +22,24 @@ def cross_page_table_merge(pdf_info: list[dict]):
pass
else:
logger.warning(f'unknown MINERU_TABLE_MERGE_ENABLE config: {is_merge_table}, pass')
pass
pass
def exclude_progress_bar_idle_time(progress_bar, idle_since: float | None, now: float | None = None):
"""Exclude non-processing idle time from a reused tqdm progress bar."""
if progress_bar is None or idle_since is None:
return
if now is None:
now = time.time()
idle_duration = now - idle_since
if idle_duration <= 0:
return
if hasattr(progress_bar, "start_t"):
progress_bar.start_t += idle_duration
if hasattr(progress_bar, "last_print_t"):
progress_bar.last_print_t = now
if hasattr(progress_bar, "last_print_n") and hasattr(progress_bar, "n"):
progress_bar.last_print_n = progress_bar.n

View File

@@ -17,6 +17,7 @@ from .model_output_to_middle_json import (
finalize_middle_json,
init_middle_json,
)
from mineru.backend.utils import exclude_progress_bar_idle_time
from ...data.data_reader_writer import DataWriter
from mineru.utils.pdf_image_tools import load_images_from_pdf_doc
from ...utils.check_sys_env import is_mac_os_version_supported
@@ -321,7 +322,9 @@ def doc_analyze(
)
infer_start = time.time()
with tqdm(total=page_count, desc="Processing pages") as progress_bar:
progress_bar = None
last_append_end_time = None
try:
for window_index, window_start in enumerate(range(0, page_count, effective_window_size or 1)):
window_end = min(page_count - 1, window_start + effective_window_size - 1)
images_list = load_images_from_pdf_doc(
@@ -340,6 +343,14 @@ def doc_analyze(
with predictor_execution_guard(predictor):
window_results = predictor.batch_two_step_extract(images=images_pil_list)
results.extend(window_results)
if progress_bar is None:
progress_bar = tqdm(total=page_count, desc="Processing pages")
else:
exclude_progress_bar_idle_time(
progress_bar,
last_append_end_time,
now=time.time(),
)
append_page_blocks_to_middle_json(
middle_json,
window_results,
@@ -349,8 +360,12 @@ def doc_analyze(
page_start_index=window_start,
progress_bar=progress_bar,
)
last_append_end_time = time.time()
finally:
_close_images(images_list)
finally:
if progress_bar is not None:
progress_bar.close()
infer_time = round(time.time() - infer_start, 2)
if infer_time > 0 and page_count > 0:
logger.debug(
@@ -398,7 +413,9 @@ async def aio_doc_analyze(
)
infer_start = time.time()
with tqdm(total=page_count, desc="Processing pages") as progress_bar:
progress_bar = None
last_append_end_time = None
try:
for window_index, window_start in enumerate(range(0, page_count, effective_window_size or 1)):
window_end = min(page_count - 1, window_start + effective_window_size - 1)
images_list = load_images_from_pdf_doc(
@@ -417,6 +434,14 @@ async def aio_doc_analyze(
async with aio_predictor_execution_guard(predictor):
window_results = await predictor.aio_batch_two_step_extract(images=images_pil_list)
results.extend(window_results)
if progress_bar is None:
progress_bar = tqdm(total=page_count, desc="Processing pages")
else:
exclude_progress_bar_idle_time(
progress_bar,
last_append_end_time,
now=time.time(),
)
append_page_blocks_to_middle_json(
middle_json,
window_results,
@@ -426,8 +451,12 @@ async def aio_doc_analyze(
page_start_index=window_start,
progress_bar=progress_bar,
)
last_append_end_time = time.time()
finally:
_close_images(images_list)
finally:
if progress_bar is not None:
progress_bar.close()
infer_time = round(time.time() - infer_start, 2)
if infer_time > 0 and page_count > 0:
logger.debug(

View File

@@ -1,6 +1,5 @@
# Copyright (c) Opendatalab. All rights reserved.
import asyncio
import json
import os
import sys
import threading
@@ -186,10 +185,6 @@ class LiveTaskStatusRenderer:
self._task_states.clear()
self.clear_locked()
def snapshot_lines(self) -> list[str]:
    """Return the current render lines as an atomic snapshot.

    Holds ``self.sink.lock`` while building, so the returned list is
    consistent with respect to concurrent renderer mutations.
    """
    with self.sink.lock:
        return self._build_render_lines_locked()
def clear_locked(self) -> None:
if self._rendered_line_count <= 0:
return
@@ -664,29 +659,6 @@ async def submit_task(
)
def submit_task_sync(
    base_url: str,
    planned_task: PlannedTask,
    form_data: dict[str, str | list[str]],
) -> SubmitResponse:
    """Submit a planned parse task synchronously.

    Builds one upload asset per document in *planned_task* and forwards the
    request to the API client. Any ``click.ClickException`` raised along the
    way is re-raised with the task label prepended for context.
    """
    try:
        upload_assets = [
            _api_client.UploadAsset(
                path=document.path,
                upload_name=f"{document.stem}{document.path.suffix}",
            )
            for document in planned_task.documents
        ]
        response = _api_client.submit_parse_task_sync(
            base_url=base_url,
            upload_assets=upload_assets,
            form_data=form_data,
        )
    except click.ClickException as exc:
        raise click.ClickException(
            f"Failed to submit {format_task_label(planned_task)}: {exc}"
        ) from exc
    return response
async def wait_for_task_result(
client: httpx.AsyncClient,
submit_response: SubmitResponse,

View File

@@ -1,7 +1,6 @@
import asyncio
import mimetypes
import os
import re
import shutil
import sys
import tempfile
@@ -269,19 +268,6 @@ def validate_parse_method(parse_method: str) -> str:
return parse_method
def sanitize_filename(filename: str) -> str:
    """Sanitize an archive-member filename.

    Strips path separators and traversal sequences, replaces every character
    outside Unicode word characters, ``.`` and ``-`` with ``_``, and prefixes
    dot-files with ``_`` so hidden files cannot be produced. Returns
    ``"unnamed"`` when nothing survives sanitization.
    """
    # Drop runs of 2+ of {/, \, .} (e.g. "../") and any remaining separators.
    cleaned = re.sub(r"[/\\.]{2,}|[/\\]", "", filename)
    # Normalize everything else to underscore, Unicode-aware.
    cleaned = re.sub(r"[^\w.-]", "_", cleaned, flags=re.UNICODE)
    if cleaned[:1] == ".":
        cleaned = "_" + cleaned[1:]
    return cleaned or "unnamed"
def cleanup_file(file_path: str) -> None:
"""清理临时文件或目录"""
try:

View File

@@ -287,14 +287,10 @@ def _offset_paragraph_title_levels(levels_by_index):
if not levels_by_index:
return levels_by_index
positive_levels = [level for level in levels_by_index.values() if level > 0]
if positive_levels and min(positive_levels) == 1:
return {
index: level + 1 if level > 0 else level
for index, level in levels_by_index.items()
}
return levels_by_index
return {
index: 2 if level == 1 else level
for index, level in levels_by_index.items()
}
def _request_paragraph_group_levels(title_block_refs, title_aided_config):