Mirror of https://github.com/opendatalab/MinerU.git (synced 2026-03-27 02:58:54 +07:00)
@@ -16,6 +16,7 @@ from mineru.backend.hybrid.hybrid_model_output_to_middle_json import (
     finalize_middle_json,
     init_middle_json,
 )
+from mineru.backend.utils import exclude_progress_bar_idle_time
 from mineru.backend.pipeline.model_init import HybridModelSingleton
 from mineru.backend.vlm.vlm_analyze import (
     ModelSingleton,
@@ -574,7 +575,9 @@ def doc_analyze(
     batch_ratio = get_batch_ratio(device) if not _vlm_ocr_enable else 1

     infer_start = time.time()
-    with tqdm(total=page_count, desc="Processing pages") as progress_bar:
+    progress_bar = None
+    last_append_end_time = None
+    try:
         for window_index, window_start in enumerate(range(0, page_count, effective_window_size or 1)):
             window_end = min(page_count - 1, window_start + effective_window_size - 1)
             images_list = load_images_from_pdf_doc(
@@ -609,6 +612,14 @@ def doc_analyze(
                 )

                 model_list.extend(window_model_list)
+                if progress_bar is None:
+                    progress_bar = tqdm(total=page_count, desc="Processing pages")
+                else:
+                    exclude_progress_bar_idle_time(
+                        progress_bar,
+                        last_append_end_time,
+                        now=time.time(),
+                    )
                 append_page_model_list_to_middle_json(
                     middle_json,
                     window_model_list,
@@ -620,8 +631,12 @@ def doc_analyze(
                     _vlm_ocr_enable=_vlm_ocr_enable,
                     progress_bar=progress_bar,
                 )
+                last_append_end_time = time.time()
             finally:
                 _close_images(images_list)
+    finally:
+        if progress_bar is not None:
+            progress_bar.close()

     infer_time = round(time.time() - infer_start, 2)
     if infer_time > 0 and page_count > 0:
@@ -687,7 +702,9 @@ async def aio_doc_analyze(
     batch_ratio = get_batch_ratio(device) if not _vlm_ocr_enable else 1

     infer_start = time.time()
-    with tqdm(total=page_count, desc="Processing pages") as progress_bar:
+    progress_bar = None
+    last_append_end_time = None
+    try:
         for window_index, window_start in enumerate(range(0, page_count, effective_window_size or 1)):
             window_end = min(page_count - 1, window_start + effective_window_size - 1)
             images_list = load_images_from_pdf_doc(
@@ -722,6 +739,14 @@ async def aio_doc_analyze(
                 )

                 model_list.extend(window_model_list)
+                if progress_bar is None:
+                    progress_bar = tqdm(total=page_count, desc="Processing pages")
+                else:
+                    exclude_progress_bar_idle_time(
+                        progress_bar,
+                        last_append_end_time,
+                        now=time.time(),
+                    )
                 append_page_model_list_to_middle_json(
                     middle_json,
                     window_model_list,
@@ -733,8 +758,12 @@ async def aio_doc_analyze(
                     _vlm_ocr_enable=_vlm_ocr_enable,
                     progress_bar=progress_bar,
                 )
+                last_append_end_time = time.time()
             finally:
                 _close_images(images_list)
+    finally:
+        if progress_bar is not None:
+            progress_bar.close()

     infer_time = round(time.time() - infer_start, 2)
     if infer_time > 0 and page_count > 0:

@@ -13,6 +13,7 @@ from .model_json_to_middle_json import (
     finalize_middle_json,
     init_middle_json,
 )
+from ..utils import exclude_progress_bar_idle_time
 from mineru.utils.config_reader import get_device, get_processing_window_size
 from ...utils.enum_class import ImageType
 from ...utils.pdf_classify import classify
@@ -186,7 +187,9 @@ def doc_analyze_streaming(
     processed_pages = 0
     infer_start = time.time()
     try:
-        with tqdm(total=total_pages, desc="Processing pages") as progress_bar:
+        progress_bar = None
+        last_append_end_time = None
+        try:
             batch_index = 0
             while processed_pages < total_pages:
                 batch_index += 1
@@ -237,6 +240,14 @@ def doc_analyze_streaming(
                     formula_enable=formula_enable,
                     table_enable=table_enable,
                 )
+                if progress_bar is None:
+                    progress_bar = tqdm(total=total_pages, desc="Processing pages")
+                else:
+                    exclude_progress_bar_idle_time(
+                        progress_bar,
+                        last_append_end_time,
+                        now=time.time(),
+                    )

                 result_offset = 0
                 for context, images_list, page_start, take_count in batch_payloads:
@@ -259,7 +270,11 @@ def doc_analyze_streaming(
                         if context['next_page_idx'] >= context['page_count'] and not context['closed']:
                             _finalize_processing_window_context(context, on_doc_ready)

+                last_append_end_time = time.time()
                 processed_pages += len(batch_images)
+        finally:
+            if progress_bar is not None:
+                progress_bar.close()

         infer_time = round(time.time() - infer_start, 2)
         if infer_time > 0:

@@ -1,4 +1,5 @@
 import os
+import time

 from loguru import logger

@@ -21,4 +22,24 @@ def cross_page_table_merge(pdf_info: list[dict]):
         pass
     else:
         logger.warning(f'unknown MINERU_TABLE_MERGE_ENABLE config: {is_merge_table}, pass')
-        pass
+        pass
+
+
+def exclude_progress_bar_idle_time(progress_bar, idle_since: float | None, now: float | None = None):
+    """Exclude non-processing idle time from a reused tqdm progress bar."""
+    if progress_bar is None or idle_since is None:
+        return
+
+    if now is None:
+        now = time.time()
+
+    idle_duration = now - idle_since
+    if idle_duration <= 0:
+        return
+
+    if hasattr(progress_bar, "start_t"):
+        progress_bar.start_t += idle_duration
+    if hasattr(progress_bar, "last_print_t"):
+        progress_bar.last_print_t = now
+    if hasattr(progress_bar, "last_print_n") and hasattr(progress_bar, "n"):
+        progress_bar.last_print_n = progress_bar.n

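A minimal sketch (not part of the diff) of how the two pieces are meant to work together: the bar is created lazily when the first batch finishes, idle gaps between batches are subtracted before each update, and the bar is closed in a finally block. `process_batch` here is a hypothetical stand-in for the per-window inference call.

    import time
    from tqdm import tqdm

    def run(batches, total_pages, process_batch):
        progress_bar = None
        last_append_end_time = None
        try:
            for batch in batches:
                pages_done = process_batch(batch)  # returns the number of pages processed
                if progress_bar is None:
                    # First batch: start the bar only now, so model warm-up and
                    # PDF loading never count against the displayed page rate.
                    progress_bar = tqdm(total=total_pages, desc="Processing pages")
                else:
                    # Later batches: shift start_t forward by the gap since the
                    # previous update, so tqdm's rate reflects processing time only.
                    exclude_progress_bar_idle_time(progress_bar, last_append_end_time, now=time.time())
                progress_bar.update(pages_done)
                last_append_end_time = time.time()
        finally:
            if progress_bar is not None:
                progress_bar.close()
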
@@ -17,6 +17,7 @@ from .model_output_to_middle_json import (
     finalize_middle_json,
     init_middle_json,
 )
+from mineru.backend.utils import exclude_progress_bar_idle_time
 from ...data.data_reader_writer import DataWriter
 from mineru.utils.pdf_image_tools import load_images_from_pdf_doc
 from ...utils.check_sys_env import is_mac_os_version_supported
@@ -321,7 +322,9 @@ def doc_analyze(
     )

     infer_start = time.time()
-    with tqdm(total=page_count, desc="Processing pages") as progress_bar:
+    progress_bar = None
+    last_append_end_time = None
+    try:
         for window_index, window_start in enumerate(range(0, page_count, effective_window_size or 1)):
             window_end = min(page_count - 1, window_start + effective_window_size - 1)
             images_list = load_images_from_pdf_doc(
@@ -340,6 +343,14 @@ def doc_analyze(
                 with predictor_execution_guard(predictor):
                     window_results = predictor.batch_two_step_extract(images=images_pil_list)
                 results.extend(window_results)
+                if progress_bar is None:
+                    progress_bar = tqdm(total=page_count, desc="Processing pages")
+                else:
+                    exclude_progress_bar_idle_time(
+                        progress_bar,
+                        last_append_end_time,
+                        now=time.time(),
+                    )
                 append_page_blocks_to_middle_json(
                     middle_json,
                     window_results,
@@ -349,8 +360,12 @@ def doc_analyze(
                     page_start_index=window_start,
                     progress_bar=progress_bar,
                 )
+                last_append_end_time = time.time()
             finally:
                 _close_images(images_list)
+    finally:
+        if progress_bar is not None:
+            progress_bar.close()
     infer_time = round(time.time() - infer_start, 2)
     if infer_time > 0 and page_count > 0:
         logger.debug(
@@ -398,7 +413,9 @@ async def aio_doc_analyze(
     )

     infer_start = time.time()
-    with tqdm(total=page_count, desc="Processing pages") as progress_bar:
+    progress_bar = None
+    last_append_end_time = None
+    try:
         for window_index, window_start in enumerate(range(0, page_count, effective_window_size or 1)):
             window_end = min(page_count - 1, window_start + effective_window_size - 1)
             images_list = load_images_from_pdf_doc(
@@ -417,6 +434,14 @@ async def aio_doc_analyze(
                 async with aio_predictor_execution_guard(predictor):
                     window_results = await predictor.aio_batch_two_step_extract(images=images_pil_list)
                 results.extend(window_results)
+                if progress_bar is None:
+                    progress_bar = tqdm(total=page_count, desc="Processing pages")
+                else:
+                    exclude_progress_bar_idle_time(
+                        progress_bar,
+                        last_append_end_time,
+                        now=time.time(),
+                    )
                 append_page_blocks_to_middle_json(
                     middle_json,
                     window_results,
@@ -426,8 +451,12 @@ async def aio_doc_analyze(
                     page_start_index=window_start,
                     progress_bar=progress_bar,
                 )
+                last_append_end_time = time.time()
             finally:
                 _close_images(images_list)
+    finally:
+        if progress_bar is not None:
+            progress_bar.close()
     infer_time = round(time.time() - infer_start, 2)
     if infer_time > 0 and page_count > 0:
         logger.debug(

@@ -1,6 +1,5 @@
 # Copyright (c) Opendatalab. All rights reserved.
 import asyncio
-import json
 import os
 import sys
 import threading
@@ -186,10 +185,6 @@ class LiveTaskStatusRenderer:
         self._task_states.clear()
         self.clear_locked()

-    def snapshot_lines(self) -> list[str]:
-        with self.sink.lock:
-            return self._build_render_lines_locked()
-
     def clear_locked(self) -> None:
         if self._rendered_line_count <= 0:
             return
@@ -664,29 +659,6 @@ async def submit_task(
     )


-def submit_task_sync(
-    base_url: str,
-    planned_task: PlannedTask,
-    form_data: dict[str, str | list[str]],
-) -> SubmitResponse:
-    try:
-        return _api_client.submit_parse_task_sync(
-            base_url=base_url,
-            upload_assets=[
-                _api_client.UploadAsset(
-                    path=document.path,
-                    upload_name=f"{document.stem}{document.path.suffix}",
-                )
-                for document in planned_task.documents
-            ],
-            form_data=form_data,
-        )
-    except click.ClickException as exc:
-        raise click.ClickException(
-            f"Failed to submit {format_task_label(planned_task)}: {exc}"
-        ) from exc
-
-
 async def wait_for_task_result(
     client: httpx.AsyncClient,
     submit_response: SubmitResponse,

@@ -1,7 +1,6 @@
 import asyncio
 import mimetypes
 import os
-import re
 import shutil
 import sys
 import tempfile
@@ -269,19 +268,6 @@ def validate_parse_method(parse_method: str) -> str:
     return parse_method


-def sanitize_filename(filename: str) -> str:
-    """
-    Sanitize the filename of a compressed-archive member.
-    Remove path-traversal characters; keep Unicode letters, digits, and ._-
-    Disallow hidden files.
-    """
-    sanitized = re.sub(r"[/\\.]{2,}|[/\\]", "", filename)
-    sanitized = re.sub(r"[^\w.-]", "_", sanitized, flags=re.UNICODE)
-    if sanitized.startswith("."):
-        sanitized = "_" + sanitized[1:]
-    return sanitized or "unnamed"
-
-
 def cleanup_file(file_path: str) -> None:
     """Clean up a temporary file or directory."""
     try:

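For reference, an illustration (not part of the diff) of what the removed sanitizer did to typical inputs:

    sanitize_filename("../../etc/passwd")  # -> "etcpasswd"
    sanitize_filename(".hidden")           # -> "_hidden"
    sanitize_filename("报告 v1.pdf")        # -> "报告_v1.pdf"
    sanitize_filename("")                  # -> "unnamed"
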
@@ -287,14 +287,10 @@ def _offset_paragraph_title_levels(levels_by_index):
     if not levels_by_index:
         return levels_by_index

-    positive_levels = [level for level in levels_by_index.values() if level > 0]
-    if positive_levels and min(positive_levels) == 1:
-        return {
-            index: level + 1 if level > 0 else level
-            for index, level in levels_by_index.items()
-        }
-
-    return levels_by_index
+    return {
+        index: 2 if level == 1 else level
+        for index, level in levels_by_index.items()
+    }


 def _request_paragraph_group_levels(title_block_refs, title_aided_config):

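A worked example (hypothetical input, not part of the diff) contrasting the two behaviors:

    levels = {0: 1, 1: 2, 2: 0}
    # old: min positive level is 1, so every positive level shifts by +1 -> {0: 2, 1: 3, 2: 0}
    # new: only level 1 is remapped to 2; other levels are untouched     -> {0: 2, 1: 2, 2: 0}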