feat(performance): add performance monitoring and optimization

- Add performance_stats module to measure and print execution time statistics
- Implement measure_time decorator to track execution time of key functions
- Remove multi-threading in pdf parsing for better resource management
- Optimize pdf parsing logic for improved performance
2026-03-27 11:08:32 +07:00 · 2025-03-03 14:52:48 +08:00
parent 6ec440d6f1
commit e516cf535c
2 changed files with 67 additions and 59 deletions
--- a/magic_pdf/libs/performance_stats.py
+++ b/magic_pdf/libs/performance_stats.py
@@ -0,0 +1,54 @@
+import time
+import functools
+from collections import defaultdict
+from typing import Dict, List
+
+
+class PerformanceStats:
+    """Collects per-function execution times and reports aggregate statistics for them."""
+
+    _stats: Dict[str, List[float]] = defaultdict(list)  # class-level mapping shared by all callers: func_name -> recorded durations
+
+    @classmethod
+    def add_execution_time(cls, func_name: str, execution_time: float):
+        """Append ``execution_time`` to the samples recorded under ``func_name``."""
+        cls._stats[func_name].append(execution_time)
+
+    @classmethod
+    def get_stats(cls) -> Dict[str, dict]:
+        """Return ``{func_name: {count, total_time, avg_time, min_time, max_time}}`` built from the recorded samples."""
+        results = {}
+        for func_name, times in cls._stats.items():
+            results[func_name] = {
+                'count': len(times),
+                'total_time': sum(times),
+                'avg_time': sum(times) / len(times),  # entries are only created by add_execution_time, which always appends a sample
+                'min_time': min(times),
+                'max_time': max(times)
+            }
+        return results
+
+    @classmethod
+    def print_stats(cls):
+        """Print a table with call count, total time and average time per function to stdout."""
+        stats = cls.get_stats()
+        print("\n性能统计结果:")  # heading text: "Performance statistics:"
+        print("-" * 80)
+        print(f"{'方法名':<40} {'调用次数':>8} {'总时间(s)':>12} {'平均时间(s)':>12}")  # columns: method name, call count, total time (s), average time (s)
+        print("-" * 80)
+        for func_name, data in stats.items():  # min_time / max_time are computed by get_stats() but not printed here
+            print(f"{func_name:<40} {data['count']:8d} {data['total_time']:12.6f} {data['avg_time']:12.6f}")
+
+
+def measure_time(func):
+    """Decorator that times each call to ``func`` and records the duration in ``PerformanceStats``."""
+
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        start_time = time.time()  # wall-clock timestamp; NOTE(review): time.perf_counter() is the usual choice for measuring intervals
+        result = func(*args, **kwargs)  # if func raises, the exception propagates and no timing is recorded for that call
+        execution_time = time.time() - start_time
+        PerformanceStats.add_execution_time(func.__name__, execution_time)  # keyed by bare __name__, so same-named functions share one entry
+        return result
+
+    return wrapper
--- a/magic_pdf/pdf_parse_union_core_v2.py
+++ b/magic_pdf/pdf_parse_union_core_v2.py
@@ -21,6 +21,7 @@ from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir, get_l
 from magic_pdf.libs.convert_utils import dict_to_list
 from magic_pdf.libs.hash_utils import compute_md5
 from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
+from magic_pdf.libs.performance_stats import measure_time, PerformanceStats
 from magic_pdf.model.magic_model import MagicModel
 from magic_pdf.post_proc.llm_aided import llm_aided_formula, llm_aided_text, llm_aided_title

@@ -217,7 +218,7 @@ def calculate_contrast(img, img_mode) -> float:
    # logger.info(f"contrast: {contrast}")
    return round(contrast, 2)

-
+@measure_time
 def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
    # cid用0xfffd表示，连字符拆开
    # text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
@@ -491,7 +492,7 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
    else:
        return [[x0, y0, x1, y1]]

-
+@measure_time
 def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
    page_line_list = []

@@ -925,7 +926,6 @@ def pdf_parse_union(
    magic_model = MagicModel(model_list, dataset)

    """根据输入的起始范围解析pdf"""
-    # end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
    end_page_id = (
        end_page_id
        if end_page_id is not None and end_page_id >= 0
@@ -939,33 +939,16 @@ def pdf_parse_union(
    """初始化启动时间"""
    start_time = time.time()

-    # for page_id, page in enumerate(dataset):
-    #     """debug时输出每页解析的耗时."""
-    #     if debug_mode:
-    #         time_now = time.time()
-    #         logger.info(
-    #             f'page_id: {page_id}, last_page_cost_time: {round(time.time() - start_time, 2)}'
-    #         )
-    #         start_time = time_now
-    #
-    #     """解析pdf中的每一页"""
-    #     if start_page_id <= page_id <= end_page_id:
-    #         page_info = parse_page_core(
-    #             page, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode, lang
-    #         )
-    #     else:
-    #         page_info = page.get_page_info()
-    #         page_w = page_info.w
-    #         page_h = page_info.h
-    #         page_info = ocr_construct_page_component_v2(
-    #             [], [], page_id, page_w, page_h, [], [], [], [], [], True, 'skip page'
-    #         )
-    #     pdf_info_dict[f'page_{page_id}'] = page_info
-    def process_page(page_id, page, dataset_len, start_page_id, end_page_id, magic_model, pdf_bytes_md5, imageWriter,
-                     parse_mode, lang, debug_mode, start_time):
+    for page_id, page in enumerate(dataset):
+        """debug时输出每页解析的耗时."""
        if debug_mode:
            time_now = time.time()
+            logger.info(
+                f'page_id: {page_id}, last_page_cost_time: {round(time.time() - start_time, 2)}'
+            )
+            start_time = time_now

+        """解析pdf中的每一页"""
        if start_page_id <= page_id <= end_page_id:
            page_info = parse_page_core(
                page, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode, lang
@@ -977,44 +960,15 @@ def pdf_parse_union(
            page_info = ocr_construct_page_component_v2(
                [], [], page_id, page_w, page_h, [], [], [], [], [], True, 'skip page'
            )
-        return page_id, page_info
+        pdf_info_dict[f'page_{page_id}'] = page_info

-    # Use max_workers based on CPU count but limit to avoid excessive resource usage
-    max_workers = 2
-    pdf_info_dict = {}
-
-    with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        futures = {
-            executor.submit(
-                process_page,
-                page_id,
-                page,
-                len(dataset),
-                start_page_id,
-                end_page_id,
-                magic_model,
-                pdf_bytes_md5,
-                imageWriter,
-                parse_mode,
-                lang,
-                debug_mode,
-                time.time()
-            ): page_id
-            for page_id, page in enumerate(dataset)
-        }
-
-        for page_id in range(len(dataset)):
-            future = [f for f in futures if futures[f] == page_id][0]
-            try:
-                page_id, page_info = future.result()
-                pdf_info_dict[f'page_{page_id}'] = page_info
-            except Exception as e:
-                logger.exception(f"Error processing page {page_id}: {e}")

    logger.info(
        f'page_process_time: {round(time.time() - start_time, 2)}'
    )

+    PerformanceStats.print_stats()
+
    """分段"""
    para_split(pdf_info_dict)