feat: enhance logging by adding dynamic log level configuration and performance metrics

myhloli
2025-12-30 16:43:24 +08:00
parent 466b85ba3f
commit 05b6ed3d8d
8 changed files with 50 additions and 17 deletions
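
In short, the commit does two things: the entry points changed below read MINERU_LOG_LEVEL from the environment and reconfigure loguru accordingly, and the analysis backends gain DEBUG-level timing metrics for image loading and inference. A self-contained sketch of the log-level pattern, assembled from the entry-point hunks below (the final logger.debug call is only there to illustrate the effect):

import os
import sys
from loguru import logger

# Read the desired level from the environment, defaulting to INFO.
log_level = os.getenv("MINERU_LOG_LEVEL", "INFO").upper()
logger.remove()                          # drop loguru's default stderr handler
logger.add(sys.stderr, level=log_level)  # re-add stderr at the configured level

logger.debug("only emitted when MINERU_LOG_LEVEL is DEBUG or TRACE")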

View File

@@ -107,7 +107,7 @@ def do_parse(
pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
-middle_json, infer_result = hybrid_doc_analyze(
+middle_json, infer_result, _vlm_ocr_enable = hybrid_doc_analyze(
pdf_bytes,
image_writer=image_writer,
backend=backend,

View File

@@ -1,5 +1,6 @@
# Copyright (c) Opendatalab. All rights reserved.
import os
+import time
from collections import defaultdict
import cv2
@@ -397,8 +398,11 @@ def doc_analyze(
predictor = ModelSingleton().get_model(backend, model_path, server_url, **kwargs)
# Load images
+load_images_start = time.time()
images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
images_pil_list = [image_dict["img_pil"] for image_dict in images_list]
+load_images_time = round(time.time() - load_images_start, 2)
+logger.debug(f"load images cost: {load_images_time}, speed: {round(len(images_pil_list)/load_images_time, 3)} images/s")
# Get device information
device = get_device()
@@ -407,6 +411,7 @@ def doc_analyze(
_ocr_enable = ocr_classify(pdf_bytes, parse_method=parse_method)
_vlm_ocr_enable = _should_enable_vlm_ocr(_ocr_enable, language, inline_formula_enable)
+infer_start = time.time()
# VLM extraction
if _vlm_ocr_enable:
results = predictor.batch_two_step_extract(images=images_pil_list)
@@ -428,6 +433,8 @@ def doc_analyze(
batch_radio=batch_ratio,
)
_normalize_bbox(inline_formula_list, ocr_res_list, images_pil_list)
+infer_time = round(time.time() - infer_start, 2)
+logger.debug(f"infer finished, cost: {infer_time}, speed: {round(len(results)/infer_time, 3)} page/s")
# Generate intermediate JSON
middle_json = result_to_middle_json(
@@ -463,8 +470,11 @@ async def aio_doc_analyze(
predictor = ModelSingleton().get_model(backend, model_path, server_url, **kwargs)
# Load images
+load_images_start = time.time()
images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
images_pil_list = [image_dict["img_pil"] for image_dict in images_list]
+load_images_time = round(time.time() - load_images_start, 2)
+logger.debug(f"load images cost: {load_images_time}, speed: {round(len(images_pil_list)/load_images_time, 3)} images/s")
# Get device information
device = get_device()
@@ -473,6 +483,7 @@ async def aio_doc_analyze(
_ocr_enable = ocr_classify(pdf_bytes, parse_method=parse_method)
_vlm_ocr_enable = _should_enable_vlm_ocr(_ocr_enable, language, inline_formula_enable)
+infer_start = time.time()
# VLM extraction
if _vlm_ocr_enable:
results = await predictor.aio_batch_two_step_extract(images=images_pil_list)
@@ -494,6 +505,8 @@ async def aio_doc_analyze(
batch_radio=batch_ratio,
)
_normalize_bbox(inline_formula_list, ocr_res_list, images_pil_list)
+infer_time = round(time.time() - infer_start, 2)
+logger.debug(f"infer finished, cost: {infer_time}, speed: {round(len(results)/infer_time, 3)} page/s")
# Generate intermediate JSON
middle_json = result_to_middle_json(
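
The metrics added to doc_analyze and aio_doc_analyze above follow a single pattern: take a wall-clock timestamp around a step, then log its cost and throughput at DEBUG level. A minimal stand-alone sketch of that pattern; the timed_step helper and its arguments are illustrative and not part of the codebase:

import time
from loguru import logger

def timed_step(name, unit, items, fn):
    # Illustrative helper mirroring the lines added above: time fn(items),
    # then report cost and throughput at DEBUG level.
    start = time.time()
    result = fn(items)
    cost = round(time.time() - start, 2)
    # Guard against a zero-duration step before dividing.
    speed = round(len(items) / cost, 3) if cost else float("inf")
    logger.debug(f"{name} finished, cost: {cost}, speed: {speed} {unit}/s")
    return result

For example, timed_step("infer", "page", images_pil_list, predictor.batch_two_step_extract) would roughly reproduce the "infer finished" line for the synchronous VLM path.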

View File

@@ -86,6 +86,7 @@ def doc_analyze(
all_image_lists = []
all_pdf_docs = []
ocr_enabled_list = []
+load_images_start = time.time()
for pdf_idx, pdf_bytes in enumerate(pdf_bytes_list):
# Determine OCR settings
_ocr_enable = False
@@ -99,10 +100,7 @@ def doc_analyze(
_lang = lang_list[pdf_idx]
# Collect the pages from each dataset
-# load_images_start = time.time()
images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
-# load_images_time = round(time.time() - load_images_start, 2)
-# logger.debug(f"load images cost: {load_images_time}, speed: {round(len(images_list) / load_images_time, 3)} images/s")
all_image_lists.append(images_list)
all_pdf_docs.append(pdf_doc)
for page_idx in range(len(images_list)):
@@ -111,6 +109,8 @@ def doc_analyze(
pdf_idx, page_idx,
img_dict['img_pil'], _ocr_enable, _lang,
))
+load_images_time = round(time.time() - load_images_start, 2)
+logger.debug(f"load images cost: {load_images_time}, speed: {round(len(all_pages_info) / load_images_time, 3)} images/s")
# Prepare batch processing
images_with_extra_info = [(info[2], info[3], info[4]) for info in all_pages_info]
@@ -123,6 +123,7 @@ def doc_analyze(
# Run batch processing
results = []
processed_images_count = 0
+infer_start = time.time()
for index, batch_image in enumerate(batch_images):
processed_images_count += len(batch_image)
logger.info(
@@ -131,6 +132,8 @@ def doc_analyze(
)
batch_results = batch_image_analyze(batch_image, formula_enable, table_enable)
results.extend(batch_results)
+infer_time = round(time.time() - infer_start, 2)
+logger.debug(f"infer finished, cost: {infer_time}, speed: {round(len(results) / infer_time, 3)} page/s")
# Build the return results
infer_results = []
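
In this batch pipeline the previously commented-out per-PDF timer is replaced by timers that wrap the whole loading loop and the whole batch loop, so the logged speed is an aggregate over all input PDFs or batches rather than per document. A minimal sketch of that choice, assuming plain lists of pages stand in for the real image loading:

import time
from loguru import logger

def load_all(pdf_page_lists):
    # Illustrative only: the timer wraps the entire loop, as in the change
    # above, so the logged speed is aggregate across every input PDF.
    start = time.time()
    all_pages = []
    for pages in pdf_page_lists:
        all_pages.extend(pages)  # stands in for load_images_from_pdf(...)
    cost = round(time.time() - start, 2)
    speed = round(len(all_pages) / cost, 3) if cost else float("inf")
    logger.debug(f"load images cost: {cost}, speed: {speed} images/s")
    return all_pages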

View File

@@ -202,16 +202,16 @@ def doc_analyze(
if predictor is None:
predictor = ModelSingleton().get_model(backend, model_path, server_url, **kwargs)
-# load_images_start = time.time()
+load_images_start = time.time()
images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
images_pil_list = [image_dict["img_pil"] for image_dict in images_list]
-# load_images_time = round(time.time() - load_images_start, 2)
-# logger.info(f"load images cost: {load_images_time}, speed: {round(len(images_base64_list)/load_images_time, 3)} images/s")
+load_images_time = round(time.time() - load_images_start, 2)
+logger.debug(f"load images cost: {load_images_time}, speed: {round(len(images_pil_list)/load_images_time, 3)} images/s")
-# infer_start = time.time()
+infer_start = time.time()
results = predictor.batch_two_step_extract(images=images_pil_list)
-# infer_time = round(time.time() - infer_start, 2)
-# logger.info(f"infer finished, cost: {infer_time}, speed: {round(len(results)/infer_time, 3)} page/s")
+infer_time = round(time.time() - infer_start, 2)
+logger.debug(f"infer finished, cost: {infer_time}, speed: {round(len(results)/infer_time, 3)} page/s")
middle_json = result_to_middle_json(results, images_list, pdf_doc, image_writer)
return middle_json, results
@@ -229,15 +229,15 @@ async def aio_doc_analyze(
if predictor is None:
predictor = ModelSingleton().get_model(backend, model_path, server_url, **kwargs)
-# load_images_start = time.time()
+load_images_start = time.time()
images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
images_pil_list = [image_dict["img_pil"] for image_dict in images_list]
-# load_images_time = round(time.time() - load_images_start, 2)
-# logger.debug(f"load images cost: {load_images_time}, speed: {round(len(images_pil_list)/load_images_time, 3)} images/s")
+load_images_time = round(time.time() - load_images_start, 2)
+logger.debug(f"load images cost: {load_images_time}, speed: {round(len(images_pil_list)/load_images_time, 3)} images/s")
-# infer_start = time.time()
+infer_start = time.time()
results = await predictor.aio_batch_two_step_extract(images=images_pil_list)
-# infer_time = round(time.time() - infer_start, 2)
-# logger.info(f"infer finished, cost: {infer_time}, speed: {round(len(results)/infer_time, 3)} page/s")
+infer_time = round(time.time() - infer_start, 2)
+logger.debug(f"infer finished, cost: {infer_time}, speed: {round(len(results)/infer_time, 3)} page/s")
middle_json = result_to_middle_json(results, images_list, pdf_doc, image_writer)
return middle_json, results

View File

@@ -1,9 +1,15 @@
# Copyright (c) Opendatalab. All rights reserved.
import os
+import sys
import click
from pathlib import Path
from loguru import logger
+log_level = os.getenv("MINERU_LOG_LEVEL", "INFO").upper()
+logger.remove() # remove the default handler
+logger.add(sys.stderr, level=log_level) # add a new handler
from mineru.utils.cli_parser import arg_parse
from mineru.utils.config_reader import get_device
from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path
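
Because the handler above is installed at import time, MINERU_LOG_LEVEL has to be set before the entry point starts; the FastAPI and Gradio entry points below repeat the same block. A small usage sketch; exporting the variable in the shell before launching the tool would have the same effect:

import os

# Must run before any MinerU entry module is imported, since the log level
# is read once at import time when the stderr handler is installed.
os.environ["MINERU_LOG_LEVEL"] = "DEBUG"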

View File

@@ -1,3 +1,4 @@
+import sys
import uuid
import os
import re
@@ -14,6 +15,11 @@ from fastapi.responses import JSONResponse, FileResponse
from starlette.background import BackgroundTask
from typing import List, Optional
from loguru import logger
+log_level = os.getenv("MINERU_LOG_LEVEL", "INFO").upper()
+logger.remove() # remove the default handler
+logger.add(sys.stderr, level=log_level) # add a new handler
from base64 import b64encode
from mineru.cli.common import aio_do_parse, read_fn, pdf_suffixes, image_suffixes

View File

@@ -3,6 +3,7 @@
import base64
import os
import re
+import sys
import time
import zipfile
from pathlib import Path
@@ -12,6 +13,10 @@ import gradio as gr
from gradio_pdf import PDF
from loguru import logger
+log_level = os.getenv("MINERU_LOG_LEVEL", "INFO").upper()
+logger.remove() # remove the default handler
+logger.add(sys.stderr, level=log_level) # add a new handler
from mineru.cli.common import prepare_env, read_fn, aio_do_parse, pdf_suffixes, image_suffixes
from mineru.utils.cli_parser import arg_parse
from mineru.utils.engine_utils import get_vlm_engine

View File

@@ -29,7 +29,7 @@ dependencies = [
"pypdfium2>=4.30.0",
"pypdf>=5.6.0",
"reportlab",
-"pdftext>=0.6.2",
+"pdftext>=0.6.3",
"modelscope>=1.26.0",
"huggingface-hub>=0.32.4",
"json-repair>=0.46.2",