mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 02:58:54 +07:00
feat: enhance logging by adding dynamic log level configuration and performance metrics
This commit is contained in:
@@ -107,7 +107,7 @@ def do_parse(
|
||||
pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
|
||||
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
|
||||
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
|
||||
middle_json, infer_result = hybrid_doc_analyze(
|
||||
middle_json, infer_result, _vlm_ocr_enable = hybrid_doc_analyze(
|
||||
pdf_bytes,
|
||||
image_writer=image_writer,
|
||||
backend=backend,
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# Copyright (c) Opendatalab. All rights reserved.
|
||||
import os
|
||||
import time
|
||||
from collections import defaultdict
|
||||
|
||||
import cv2
|
||||
@@ -397,8 +398,11 @@ def doc_analyze(
|
||||
predictor = ModelSingleton().get_model(backend, model_path, server_url, **kwargs)
|
||||
|
||||
# 加载图像
|
||||
load_images_start = time.time()
|
||||
images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
|
||||
images_pil_list = [image_dict["img_pil"] for image_dict in images_list]
|
||||
load_images_time = round(time.time() - load_images_start, 2)
|
||||
logger.debug(f"load images cost: {load_images_time}, speed: {round(len(images_pil_list)/load_images_time, 3)} images/s")
|
||||
|
||||
# 获取设备信息
|
||||
device = get_device()
|
||||
@@ -407,6 +411,7 @@ def doc_analyze(
|
||||
_ocr_enable = ocr_classify(pdf_bytes, parse_method=parse_method)
|
||||
_vlm_ocr_enable = _should_enable_vlm_ocr(_ocr_enable, language, inline_formula_enable)
|
||||
|
||||
infer_start = time.time()
|
||||
# VLM提取
|
||||
if _vlm_ocr_enable:
|
||||
results = predictor.batch_two_step_extract(images=images_pil_list)
|
||||
@@ -428,6 +433,8 @@ def doc_analyze(
|
||||
batch_radio=batch_ratio,
|
||||
)
|
||||
_normalize_bbox(inline_formula_list, ocr_res_list, images_pil_list)
|
||||
infer_time = round(time.time() - infer_start, 2)
|
||||
logger.debug(f"infer finished, cost: {infer_time}, speed: {round(len(results)/infer_time, 3)} page/s")
|
||||
|
||||
# 生成中间JSON
|
||||
middle_json = result_to_middle_json(
|
||||
@@ -463,8 +470,11 @@ async def aio_doc_analyze(
|
||||
predictor = ModelSingleton().get_model(backend, model_path, server_url, **kwargs)
|
||||
|
||||
# 加载图像
|
||||
load_images_start = time.time()
|
||||
images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
|
||||
images_pil_list = [image_dict["img_pil"] for image_dict in images_list]
|
||||
load_images_time = round(time.time() - load_images_start, 2)
|
||||
logger.debug(f"load images cost: {load_images_time}, speed: {round(len(images_pil_list)/load_images_time, 3)} images/s")
|
||||
|
||||
# 获取设备信息
|
||||
device = get_device()
|
||||
@@ -473,6 +483,7 @@ async def aio_doc_analyze(
|
||||
_ocr_enable = ocr_classify(pdf_bytes, parse_method=parse_method)
|
||||
_vlm_ocr_enable = _should_enable_vlm_ocr(_ocr_enable, language, inline_formula_enable)
|
||||
|
||||
infer_start = time.time()
|
||||
# VLM提取
|
||||
if _vlm_ocr_enable:
|
||||
results = await predictor.aio_batch_two_step_extract(images=images_pil_list)
|
||||
@@ -494,6 +505,8 @@ async def aio_doc_analyze(
|
||||
batch_radio=batch_ratio,
|
||||
)
|
||||
_normalize_bbox(inline_formula_list, ocr_res_list, images_pil_list)
|
||||
infer_time = round(time.time() - infer_start, 2)
|
||||
logger.debug(f"infer finished, cost: {infer_time}, speed: {round(len(results)/infer_time, 3)} page/s")
|
||||
|
||||
# 生成中间JSON
|
||||
middle_json = result_to_middle_json(
|
||||
|
||||
@@ -86,6 +86,7 @@ def doc_analyze(
|
||||
all_image_lists = []
|
||||
all_pdf_docs = []
|
||||
ocr_enabled_list = []
|
||||
load_images_start = time.time()
|
||||
for pdf_idx, pdf_bytes in enumerate(pdf_bytes_list):
|
||||
# 确定OCR设置
|
||||
_ocr_enable = False
|
||||
@@ -99,10 +100,7 @@ def doc_analyze(
|
||||
_lang = lang_list[pdf_idx]
|
||||
|
||||
# 收集每个数据集中的页面
|
||||
# load_images_start = time.time()
|
||||
images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
|
||||
# load_images_time = round(time.time() - load_images_start, 2)
|
||||
# logger.debug(f"load images cost: {load_images_time}, speed: {round(len(images_list) / load_images_time, 3)} images/s")
|
||||
all_image_lists.append(images_list)
|
||||
all_pdf_docs.append(pdf_doc)
|
||||
for page_idx in range(len(images_list)):
|
||||
@@ -111,6 +109,8 @@ def doc_analyze(
|
||||
pdf_idx, page_idx,
|
||||
img_dict['img_pil'], _ocr_enable, _lang,
|
||||
))
|
||||
load_images_time = round(time.time() - load_images_start, 2)
|
||||
logger.debug(f"load images cost: {load_images_time}, speed: {round(len(all_pages_info) / load_images_time, 3)} images/s")
|
||||
|
||||
# 准备批处理
|
||||
images_with_extra_info = [(info[2], info[3], info[4]) for info in all_pages_info]
|
||||
@@ -123,6 +123,7 @@ def doc_analyze(
|
||||
# 执行批处理
|
||||
results = []
|
||||
processed_images_count = 0
|
||||
infer_start = time.time()
|
||||
for index, batch_image in enumerate(batch_images):
|
||||
processed_images_count += len(batch_image)
|
||||
logger.info(
|
||||
@@ -131,6 +132,8 @@ def doc_analyze(
|
||||
)
|
||||
batch_results = batch_image_analyze(batch_image, formula_enable, table_enable)
|
||||
results.extend(batch_results)
|
||||
infer_time = round(time.time() - infer_start, 2)
|
||||
logger.debug(f"infer finished, cost: {infer_time}, speed: {round(len(results) / infer_time, 3)} page/s")
|
||||
|
||||
# 构建返回结果
|
||||
infer_results = []
|
||||
|
||||
@@ -202,16 +202,16 @@ def doc_analyze(
|
||||
if predictor is None:
|
||||
predictor = ModelSingleton().get_model(backend, model_path, server_url, **kwargs)
|
||||
|
||||
# load_images_start = time.time()
|
||||
load_images_start = time.time()
|
||||
images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
|
||||
images_pil_list = [image_dict["img_pil"] for image_dict in images_list]
|
||||
# load_images_time = round(time.time() - load_images_start, 2)
|
||||
# logger.info(f"load images cost: {load_images_time}, speed: {round(len(images_base64_list)/load_images_time, 3)} images/s")
|
||||
load_images_time = round(time.time() - load_images_start, 2)
|
||||
logger.debug(f"load images cost: {load_images_time}, speed: {round(len(images_pil_list)/load_images_time, 3)} images/s")
|
||||
|
||||
# infer_start = time.time()
|
||||
infer_start = time.time()
|
||||
results = predictor.batch_two_step_extract(images=images_pil_list)
|
||||
# infer_time = round(time.time() - infer_start, 2)
|
||||
# logger.info(f"infer finished, cost: {infer_time}, speed: {round(len(results)/infer_time, 3)} page/s")
|
||||
infer_time = round(time.time() - infer_start, 2)
|
||||
logger.debug(f"infer finished, cost: {infer_time}, speed: {round(len(results)/infer_time, 3)} page/s")
|
||||
|
||||
middle_json = result_to_middle_json(results, images_list, pdf_doc, image_writer)
|
||||
return middle_json, results
|
||||
@@ -229,15 +229,15 @@ async def aio_doc_analyze(
|
||||
if predictor is None:
|
||||
predictor = ModelSingleton().get_model(backend, model_path, server_url, **kwargs)
|
||||
|
||||
# load_images_start = time.time()
|
||||
load_images_start = time.time()
|
||||
images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
|
||||
images_pil_list = [image_dict["img_pil"] for image_dict in images_list]
|
||||
# load_images_time = round(time.time() - load_images_start, 2)
|
||||
# logger.debug(f"load images cost: {load_images_time}, speed: {round(len(images_pil_list)/load_images_time, 3)} images/s")
|
||||
load_images_time = round(time.time() - load_images_start, 2)
|
||||
logger.debug(f"load images cost: {load_images_time}, speed: {round(len(images_pil_list)/load_images_time, 3)} images/s")
|
||||
|
||||
# infer_start = time.time()
|
||||
infer_start = time.time()
|
||||
results = await predictor.aio_batch_two_step_extract(images=images_pil_list)
|
||||
# infer_time = round(time.time() - infer_start, 2)
|
||||
# logger.info(f"infer finished, cost: {infer_time}, speed: {round(len(results)/infer_time, 3)} page/s")
|
||||
infer_time = round(time.time() - infer_start, 2)
|
||||
logger.debug(f"infer finished, cost: {infer_time}, speed: {round(len(results)/infer_time, 3)} page/s")
|
||||
middle_json = result_to_middle_json(results, images_list, pdf_doc, image_writer)
|
||||
return middle_json, results
|
||||
|
||||
@@ -1,9 +1,15 @@
|
||||
# Copyright (c) Opendatalab. All rights reserved.
|
||||
import os
|
||||
import sys
|
||||
|
||||
import click
|
||||
from pathlib import Path
|
||||
from loguru import logger
|
||||
|
||||
# Configure loguru's stderr handler from the MINERU_LOG_LEVEL environment
# variable (default "INFO"). The value is stripped and upper-cased so that
# " debug " behaves like "DEBUG"; a set-but-empty variable is treated as unset.
log_level = os.getenv("MINERU_LOG_LEVEL", "INFO").strip().upper() or "INFO"
logger.remove()  # remove the default handler so the level chosen below applies
try:
    logger.add(sys.stderr, level=log_level)  # add the replacement handler
except ValueError:
    # loguru raises ValueError for a level name it does not know. The default
    # handler is already gone at this point, so install an INFO handler rather
    # than aborting the import, then report the bad value.
    invalid_log_level = log_level
    log_level = "INFO"
    logger.add(sys.stderr, level=log_level)
    logger.warning(
        f"Invalid MINERU_LOG_LEVEL {invalid_log_level!r}; falling back to INFO"
    )
|
||||
|
||||
from mineru.utils.cli_parser import arg_parse
|
||||
from mineru.utils.config_reader import get_device
|
||||
from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import sys
|
||||
import uuid
|
||||
import os
|
||||
import re
|
||||
@@ -14,6 +15,11 @@ from fastapi.responses import JSONResponse, FileResponse
|
||||
from starlette.background import BackgroundTask
|
||||
from typing import List, Optional
|
||||
from loguru import logger
|
||||
|
||||
# Configure loguru's stderr handler from the MINERU_LOG_LEVEL environment
# variable (default "INFO"). The value is stripped and upper-cased so that
# " debug " behaves like "DEBUG"; a set-but-empty variable is treated as unset.
log_level = os.getenv("MINERU_LOG_LEVEL", "INFO").strip().upper() or "INFO"
logger.remove()  # remove the default handler so the level chosen below applies
try:
    logger.add(sys.stderr, level=log_level)  # add the replacement handler
except ValueError:
    # loguru raises ValueError for a level name it does not know. The default
    # handler is already gone at this point, so install an INFO handler rather
    # than aborting the import, then report the bad value.
    invalid_log_level = log_level
    log_level = "INFO"
    logger.add(sys.stderr, level=log_level)
    logger.warning(
        f"Invalid MINERU_LOG_LEVEL {invalid_log_level!r}; falling back to INFO"
    )
|
||||
|
||||
from base64 import b64encode
|
||||
|
||||
from mineru.cli.common import aio_do_parse, read_fn, pdf_suffixes, image_suffixes
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
import base64
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
@@ -12,6 +13,10 @@ import gradio as gr
|
||||
from gradio_pdf import PDF
|
||||
from loguru import logger
|
||||
|
||||
# Configure loguru's stderr handler from the MINERU_LOG_LEVEL environment
# variable (default "INFO"). The value is stripped and upper-cased so that
# " debug " behaves like "DEBUG"; a set-but-empty variable is treated as unset.
log_level = os.getenv("MINERU_LOG_LEVEL", "INFO").strip().upper() or "INFO"
logger.remove()  # remove the default handler so the level chosen below applies
try:
    logger.add(sys.stderr, level=log_level)  # add the replacement handler
except ValueError:
    # loguru raises ValueError for a level name it does not know. The default
    # handler is already gone at this point, so install an INFO handler rather
    # than aborting the import, then report the bad value.
    invalid_log_level = log_level
    log_level = "INFO"
    logger.add(sys.stderr, level=log_level)
    logger.warning(
        f"Invalid MINERU_LOG_LEVEL {invalid_log_level!r}; falling back to INFO"
    )
|
||||
|
||||
from mineru.cli.common import prepare_env, read_fn, aio_do_parse, pdf_suffixes, image_suffixes
|
||||
from mineru.utils.cli_parser import arg_parse
|
||||
from mineru.utils.engine_utils import get_vlm_engine
|
||||
|
||||
@@ -29,7 +29,7 @@ dependencies = [
|
||||
"pypdfium2>=4.30.0",
|
||||
"pypdf>=5.6.0",
|
||||
"reportlab",
|
||||
"pdftext>=0.6.2",
|
||||
"pdftext>=0.6.3",
|
||||
"modelscope>=1.26.0",
|
||||
"huggingface-hub>=0.32.4",
|
||||
"json-repair>=0.46.2",
|
||||
|
||||
Reference in New Issue
Block a user