diff --git a/demo/1.md b/demo/1.md deleted file mode 100644 index fbc1284f..00000000 --- a/demo/1.md +++ /dev/null @@ -1 +0,0 @@ - ~~**xyz**~~ \ No newline at end of file diff --git a/mineru/backend/hybrid/hybrid_analyze.py b/mineru/backend/hybrid/hybrid_analyze.py index f850ab56..c8fe7751 100644 --- a/mineru/backend/hybrid/hybrid_analyze.py +++ b/mineru/backend/hybrid/hybrid_analyze.py @@ -16,6 +16,7 @@ from mineru.backend.hybrid.hybrid_model_output_to_middle_json import ( finalize_middle_json, init_middle_json, ) +from mineru.backend.utils import exclude_progress_bar_idle_time from mineru.backend.pipeline.model_init import HybridModelSingleton from mineru.backend.vlm.vlm_analyze import ( ModelSingleton, @@ -574,7 +575,9 @@ def doc_analyze( batch_ratio = get_batch_ratio(device) if not _vlm_ocr_enable else 1 infer_start = time.time() - with tqdm(total=page_count, desc="Processing pages") as progress_bar: + progress_bar = None + last_append_end_time = None + try: for window_index, window_start in enumerate(range(0, page_count, effective_window_size or 1)): window_end = min(page_count - 1, window_start + effective_window_size - 1) images_list = load_images_from_pdf_doc( @@ -609,6 +612,14 @@ def doc_analyze( ) model_list.extend(window_model_list) + if progress_bar is None: + progress_bar = tqdm(total=page_count, desc="Processing pages") + else: + exclude_progress_bar_idle_time( + progress_bar, + last_append_end_time, + now=time.time(), + ) append_page_model_list_to_middle_json( middle_json, window_model_list, @@ -620,8 +631,12 @@ def doc_analyze( _vlm_ocr_enable=_vlm_ocr_enable, progress_bar=progress_bar, ) + last_append_end_time = time.time() finally: _close_images(images_list) + finally: + if progress_bar is not None: + progress_bar.close() infer_time = round(time.time() - infer_start, 2) if infer_time > 0 and page_count > 0: @@ -687,7 +702,9 @@ async def aio_doc_analyze( batch_ratio = get_batch_ratio(device) if not _vlm_ocr_enable else 1 infer_start = time.time() - with tqdm(total=page_count, desc="Processing pages") as progress_bar: + progress_bar = None + last_append_end_time = None + try: for window_index, window_start in enumerate(range(0, page_count, effective_window_size or 1)): window_end = min(page_count - 1, window_start + effective_window_size - 1) images_list = load_images_from_pdf_doc( @@ -722,6 +739,14 @@ async def aio_doc_analyze( ) model_list.extend(window_model_list) + if progress_bar is None: + progress_bar = tqdm(total=page_count, desc="Processing pages") + else: + exclude_progress_bar_idle_time( + progress_bar, + last_append_end_time, + now=time.time(), + ) append_page_model_list_to_middle_json( middle_json, window_model_list, @@ -733,8 +758,12 @@ async def aio_doc_analyze( _vlm_ocr_enable=_vlm_ocr_enable, progress_bar=progress_bar, ) + last_append_end_time = time.time() finally: _close_images(images_list) + finally: + if progress_bar is not None: + progress_bar.close() infer_time = round(time.time() - infer_start, 2) if infer_time > 0 and page_count > 0: diff --git a/mineru/backend/pipeline/pipeline_analyze.py b/mineru/backend/pipeline/pipeline_analyze.py index 919fb86d..39ccf615 100644 --- a/mineru/backend/pipeline/pipeline_analyze.py +++ b/mineru/backend/pipeline/pipeline_analyze.py @@ -13,6 +13,7 @@ from .model_json_to_middle_json import ( finalize_middle_json, init_middle_json, ) +from ..utils import exclude_progress_bar_idle_time from mineru.utils.config_reader import get_device, get_processing_window_size from ...utils.enum_class import ImageType from ...utils.pdf_classify import classify @@ -186,7 +187,9 @@ def doc_analyze_streaming( processed_pages = 0 infer_start = time.time() try: - with tqdm(total=total_pages, desc="Processing pages") as progress_bar: + progress_bar = None + last_append_end_time = None + try: batch_index = 0 while processed_pages < total_pages: batch_index += 1 @@ -237,6 +240,14 @@ def doc_analyze_streaming( formula_enable=formula_enable, table_enable=table_enable, ) + if progress_bar is None: + progress_bar = tqdm(total=total_pages, desc="Processing pages") + else: + exclude_progress_bar_idle_time( + progress_bar, + last_append_end_time, + now=time.time(), + ) result_offset = 0 for context, images_list, page_start, take_count in batch_payloads: @@ -259,7 +270,11 @@ def doc_analyze_streaming( if context['next_page_idx'] >= context['page_count'] and not context['closed']: _finalize_processing_window_context(context, on_doc_ready) + last_append_end_time = time.time() processed_pages += len(batch_images) + finally: + if progress_bar is not None: + progress_bar.close() infer_time = round(time.time() - infer_start, 2) if infer_time > 0: diff --git a/mineru/backend/utils.py b/mineru/backend/utils.py index 4308507d..aedbd1c0 100644 --- a/mineru/backend/utils.py +++ b/mineru/backend/utils.py @@ -1,4 +1,5 @@ import os +import time from loguru import logger @@ -21,4 +22,24 @@ def cross_page_table_merge(pdf_info: list[dict]): pass else: logger.warning(f'unknown MINERU_TABLE_MERGE_ENABLE config: {is_merge_table}, pass') - pass \ No newline at end of file + pass + + +def exclude_progress_bar_idle_time(progress_bar, idle_since: float | None, now: float | None = None): + """Exclude non-processing idle time from a reused tqdm progress bar.""" + if progress_bar is None or idle_since is None: + return + + if now is None: + now = time.time() + + idle_duration = now - idle_since + if idle_duration <= 0: + return + + if hasattr(progress_bar, "start_t"): + progress_bar.start_t += idle_duration + if hasattr(progress_bar, "last_print_t"): + progress_bar.last_print_t = now + if hasattr(progress_bar, "last_print_n") and hasattr(progress_bar, "n"): + progress_bar.last_print_n = progress_bar.n diff --git a/mineru/backend/vlm/vlm_analyze.py b/mineru/backend/vlm/vlm_analyze.py index eb03cba3..ef5466da 100644 --- a/mineru/backend/vlm/vlm_analyze.py +++ b/mineru/backend/vlm/vlm_analyze.py @@ -17,6 +17,7 @@ from .model_output_to_middle_json import ( finalize_middle_json, init_middle_json, ) +from mineru.backend.utils import exclude_progress_bar_idle_time from ...data.data_reader_writer import DataWriter from mineru.utils.pdf_image_tools import load_images_from_pdf_doc from ...utils.check_sys_env import is_mac_os_version_supported @@ -321,7 +322,9 @@ def doc_analyze( ) infer_start = time.time() - with tqdm(total=page_count, desc="Processing pages") as progress_bar: + progress_bar = None + last_append_end_time = None + try: for window_index, window_start in enumerate(range(0, page_count, effective_window_size or 1)): window_end = min(page_count - 1, window_start + effective_window_size - 1) images_list = load_images_from_pdf_doc( @@ -340,6 +343,14 @@ def doc_analyze( with predictor_execution_guard(predictor): window_results = predictor.batch_two_step_extract(images=images_pil_list) results.extend(window_results) + if progress_bar is None: + progress_bar = tqdm(total=page_count, desc="Processing pages") + else: + exclude_progress_bar_idle_time( + progress_bar, + last_append_end_time, + now=time.time(), + ) append_page_blocks_to_middle_json( middle_json, window_results, @@ -349,8 +360,12 @@ def doc_analyze( page_start_index=window_start, progress_bar=progress_bar, ) + last_append_end_time = time.time() finally: _close_images(images_list) + finally: + if progress_bar is not None: + progress_bar.close() infer_time = round(time.time() - infer_start, 2) if infer_time > 0 and page_count > 0: logger.debug( @@ -398,7 +413,9 @@ async def aio_doc_analyze( ) infer_start = time.time() - with tqdm(total=page_count, desc="Processing pages") as progress_bar: + progress_bar = None + last_append_end_time = None + try: for window_index, window_start in enumerate(range(0, page_count, effective_window_size or 1)): window_end = min(page_count - 1, window_start + effective_window_size - 1) images_list = load_images_from_pdf_doc( @@ -417,6 +434,14 @@ async def aio_doc_analyze( async with aio_predictor_execution_guard(predictor): window_results = await predictor.aio_batch_two_step_extract(images=images_pil_list) results.extend(window_results) + if progress_bar is None: + progress_bar = tqdm(total=page_count, desc="Processing pages") + else: + exclude_progress_bar_idle_time( + progress_bar, + last_append_end_time, + now=time.time(), + ) append_page_blocks_to_middle_json( middle_json, window_results, @@ -426,8 +451,12 @@ async def aio_doc_analyze( page_start_index=window_start, progress_bar=progress_bar, ) + last_append_end_time = time.time() finally: _close_images(images_list) + finally: + if progress_bar is not None: + progress_bar.close() infer_time = round(time.time() - infer_start, 2) if infer_time > 0 and page_count > 0: logger.debug( diff --git a/mineru/cli/client.py b/mineru/cli/client.py index 8c73bf1c..813f6d08 100644 --- a/mineru/cli/client.py +++ b/mineru/cli/client.py @@ -1,6 +1,5 @@ # Copyright (c) Opendatalab. All rights reserved. import asyncio -import json import os import sys import threading @@ -186,10 +185,6 @@ class LiveTaskStatusRenderer: self._task_states.clear() self.clear_locked() - def snapshot_lines(self) -> list[str]: - with self.sink.lock: - return self._build_render_lines_locked() - def clear_locked(self) -> None: if self._rendered_line_count <= 0: return @@ -664,29 +659,6 @@ async def submit_task( ) -def submit_task_sync( - base_url: str, - planned_task: PlannedTask, - form_data: dict[str, str | list[str]], -) -> SubmitResponse: - try: - return _api_client.submit_parse_task_sync( - base_url=base_url, - upload_assets=[ - _api_client.UploadAsset( - path=document.path, - upload_name=f"{document.stem}{document.path.suffix}", - ) - for document in planned_task.documents - ], - form_data=form_data, - ) - except click.ClickException as exc: - raise click.ClickException( - f"Failed to submit {format_task_label(planned_task)}: {exc}" - ) from exc - - async def wait_for_task_result( client: httpx.AsyncClient, submit_response: SubmitResponse, diff --git a/mineru/cli/fast_api.py b/mineru/cli/fast_api.py index a047897c..6fc5804d 100644 --- a/mineru/cli/fast_api.py +++ b/mineru/cli/fast_api.py @@ -1,7 +1,6 @@ import asyncio import mimetypes import os -import re import shutil import sys import tempfile @@ -269,19 +268,6 @@ def validate_parse_method(parse_method: str) -> str: return parse_method -def sanitize_filename(filename: str) -> str: - """ - 格式化压缩文件的文件名 - 移除路径遍历字符, 保留 Unicode 字母、数字、._- - 禁止隐藏文件 - """ - sanitized = re.sub(r"[/\\.]{2,}|[/\\]", "", filename) - sanitized = re.sub(r"[^\w.-]", "_", sanitized, flags=re.UNICODE) - if sanitized.startswith("."): - sanitized = "_" + sanitized[1:] - return sanitized or "unnamed" - - def cleanup_file(file_path: str) -> None: """清理临时文件或目录""" try: diff --git a/mineru/utils/llm_aided.py b/mineru/utils/llm_aided.py index d7e6da0c..aaff369b 100644 --- a/mineru/utils/llm_aided.py +++ b/mineru/utils/llm_aided.py @@ -287,14 +287,10 @@ def _offset_paragraph_title_levels(levels_by_index): if not levels_by_index: return levels_by_index - positive_levels = [level for level in levels_by_index.values() if level > 0] - if positive_levels and min(positive_levels) == 1: - return { - index: level + 1 if level > 0 else level - for index, level in levels_by_index.items() - } - - return levels_by_index + return { + index: 2 if level == 1 else level + for index, level in levels_by_index.items() + } def _request_paragraph_group_levels(title_block_refs, title_aided_config):