mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
feat(performance): add performance monitoring and optimization
- Add performance_stats module to measure and print execution time statistics
- Implement measure_time decorator to track execution time of key functions
- Remove multi-threading in PDF parsing for better resource management
- Optimize PDF parsing logic for improved performance
This commit is contained in:
54
magic_pdf/libs/performance_stats.py
Normal file
54
magic_pdf/libs/performance_stats.py
Normal file
@@ -0,0 +1,54 @@
|
||||
import time
|
||||
import functools
|
||||
from collections import defaultdict
|
||||
from typing import Dict, List
|
||||
|
||||
|
||||
class PerformanceStats:
    """Collect and display execution times of functions.

    All timings live in a single class-level table, so every caller in the
    process shares the same statistics. The class is used through its
    classmethods and is never instantiated by the code in this module.
    """

    # Maps a function name to the list of durations recorded for it, in the
    # order they were added. Values are reported as seconds by print_stats.
    _stats: Dict[str, List[float]] = defaultdict(list)

    @classmethod
    def add_execution_time(cls, func_name: str, execution_time: float):
        """Record one execution time for ``func_name``.

        Args:
            func_name: Key under which the timing is grouped.
            execution_time: Duration of a single call.
        """
        cls._stats[func_name].append(execution_time)

    @classmethod
    def get_stats(cls) -> Dict[str, dict]:
        """Return aggregated statistics for every function with recorded calls.

        Returns:
            A dict keyed by function name. Each value holds ``count``,
            ``total_time``, ``avg_time``, ``min_time`` and ``max_time``.
            Names that have no recorded timings are omitted.
        """
        results = {}
        for func_name, times in cls._stats.items():
            if not times:
                # _stats is a defaultdict: merely looking a name up creates an
                # empty list. Skip such entries instead of dividing by zero
                # or calling min()/max() on an empty sequence.
                continue
            total_time = sum(times)
            results[func_name] = {
                'count': len(times),
                'total_time': total_time,
                'avg_time': total_time / len(times),
                'min_time': min(times),
                'max_time': max(times),
            }
        return results

    @classmethod
    def print_stats(cls):
        """Print a table of call count, total time and average time to stdout."""
        stats = cls.get_stats()
        print("\n性能统计结果:")
        print("-" * 80)
        print(f"{'方法名':<40} {'调用次数':>8} {'总时间(s)':>12} {'平均时间(s)':>12}")
        print("-" * 80)
        for func_name, data in stats.items():
            print(f"{func_name:<40} {data['count']:8d} {data['total_time']:12.6f} {data['avg_time']:12.6f}")
|
||||
|
||||
|
||||
def measure_time(func):
    """Decorator that records how long each call to ``func`` takes.

    Every call's duration is added to ``PerformanceStats`` under
    ``func.__name__``. The duration is recorded even when ``func`` raises, so
    time spent in failing calls is not lost; the exception then propagates
    unchanged.

    Args:
        func: The callable to time.

    Returns:
        A wrapper with the same signature and return value as ``func``.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic and high-resolution; time.time() can jump
        # when the system clock is adjusted and would skew measured durations.
        start_time = time.perf_counter()
        try:
            return func(*args, **kwargs)
        finally:
            PerformanceStats.add_execution_time(
                func.__name__, time.perf_counter() - start_time
            )

    return wrapper
|
||||
@@ -21,6 +21,7 @@ from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir, get_l
|
||||
from magic_pdf.libs.convert_utils import dict_to_list
|
||||
from magic_pdf.libs.hash_utils import compute_md5
|
||||
from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
|
||||
from magic_pdf.libs.performance_stats import measure_time, PerformanceStats
|
||||
from magic_pdf.model.magic_model import MagicModel
|
||||
from magic_pdf.post_proc.llm_aided import llm_aided_formula, llm_aided_text, llm_aided_title
|
||||
|
||||
@@ -217,7 +218,7 @@ def calculate_contrast(img, img_mode) -> float:
|
||||
# logger.info(f"contrast: {contrast}")
|
||||
return round(contrast, 2)
|
||||
|
||||
|
||||
@measure_time
|
||||
def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
|
||||
# cid用0xfffd表示,连字符拆开
|
||||
# text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
|
||||
@@ -491,7 +492,7 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
|
||||
else:
|
||||
return [[x0, y0, x1, y1]]
|
||||
|
||||
|
||||
@measure_time
|
||||
def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
|
||||
page_line_list = []
|
||||
|
||||
@@ -925,7 +926,6 @@ def pdf_parse_union(
|
||||
magic_model = MagicModel(model_list, dataset)
|
||||
|
||||
"""根据输入的起始范围解析pdf"""
|
||||
# end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
|
||||
end_page_id = (
|
||||
end_page_id
|
||||
if end_page_id is not None and end_page_id >= 0
|
||||
@@ -939,33 +939,16 @@ def pdf_parse_union(
|
||||
"""初始化启动时间"""
|
||||
start_time = time.time()
|
||||
|
||||
# for page_id, page in enumerate(dataset):
|
||||
# """debug时输出每页解析的耗时."""
|
||||
# if debug_mode:
|
||||
# time_now = time.time()
|
||||
# logger.info(
|
||||
# f'page_id: {page_id}, last_page_cost_time: {round(time.time() - start_time, 2)}'
|
||||
# )
|
||||
# start_time = time_now
|
||||
#
|
||||
# """解析pdf中的每一页"""
|
||||
# if start_page_id <= page_id <= end_page_id:
|
||||
# page_info = parse_page_core(
|
||||
# page, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode, lang
|
||||
# )
|
||||
# else:
|
||||
# page_info = page.get_page_info()
|
||||
# page_w = page_info.w
|
||||
# page_h = page_info.h
|
||||
# page_info = ocr_construct_page_component_v2(
|
||||
# [], [], page_id, page_w, page_h, [], [], [], [], [], True, 'skip page'
|
||||
# )
|
||||
# pdf_info_dict[f'page_{page_id}'] = page_info
|
||||
def process_page(page_id, page, dataset_len, start_page_id, end_page_id, magic_model, pdf_bytes_md5, imageWriter,
|
||||
parse_mode, lang, debug_mode, start_time):
|
||||
for page_id, page in enumerate(dataset):
|
||||
"""debug时输出每页解析的耗时."""
|
||||
if debug_mode:
|
||||
time_now = time.time()
|
||||
logger.info(
|
||||
f'page_id: {page_id}, last_page_cost_time: {round(time.time() - start_time, 2)}'
|
||||
)
|
||||
start_time = time_now
|
||||
|
||||
"""解析pdf中的每一页"""
|
||||
if start_page_id <= page_id <= end_page_id:
|
||||
page_info = parse_page_core(
|
||||
page, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode, lang
|
||||
@@ -977,44 +960,15 @@ def pdf_parse_union(
|
||||
page_info = ocr_construct_page_component_v2(
|
||||
[], [], page_id, page_w, page_h, [], [], [], [], [], True, 'skip page'
|
||||
)
|
||||
return page_id, page_info
|
||||
pdf_info_dict[f'page_{page_id}'] = page_info
|
||||
|
||||
# Use max_workers based on CPU count but limit to avoid excessive resource usage
|
||||
max_workers = 2
|
||||
pdf_info_dict = {}
|
||||
|
||||
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
||||
futures = {
|
||||
executor.submit(
|
||||
process_page,
|
||||
page_id,
|
||||
page,
|
||||
len(dataset),
|
||||
start_page_id,
|
||||
end_page_id,
|
||||
magic_model,
|
||||
pdf_bytes_md5,
|
||||
imageWriter,
|
||||
parse_mode,
|
||||
lang,
|
||||
debug_mode,
|
||||
time.time()
|
||||
): page_id
|
||||
for page_id, page in enumerate(dataset)
|
||||
}
|
||||
|
||||
for page_id in range(len(dataset)):
|
||||
future = [f for f in futures if futures[f] == page_id][0]
|
||||
try:
|
||||
page_id, page_info = future.result()
|
||||
pdf_info_dict[f'page_{page_id}'] = page_info
|
||||
except Exception as e:
|
||||
logger.exception(f"Error processing page {page_id}: {e}")
|
||||
|
||||
logger.info(
|
||||
f'page_process_time: {round(time.time() - start_time, 2)}'
|
||||
)
|
||||
|
||||
PerformanceStats.print_stats()
|
||||
|
||||
"""分段"""
|
||||
para_split(pdf_info_dict)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user