refactor: enhance title hierarchy logic and update model configuration

2026-03-27 11:08:32 +07:00 · 2025-10-22 15:57:07 +08:00
parent ab480a7a86
commit a220b8a208
4 changed files with 18 additions and 5 deletions
--- a/mineru.template.json
+++ b/mineru.template.json
@@ -17,7 +17,7 @@
        "title_aided": {
            "api_key": "your_api_key",
            "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
-            "model": "qwen2.5-32b-instruct",
+            "model": "qwen3-next-80b-a3b-instruct",
            "enable": false
        }
    },
--- a/mineru/backend/pipeline/model_init.py
+++ b/mineru/backend/pipeline/model_init.py
@@ -17,8 +17,10 @@ from ...model.table.rec.unet_table.main import UnetTableModel
 from ...utils.enum_class import ModelPath
 from ...utils.models_download_utils import auto_download_and_get_model_root_path

-MFR_MODEL = "unimernet_small"
-# MFR_MODEL = "pp_formulanet_plus_m"
+MFR_MODEL = os.getenv('MINERU_MFR_MODEL', None)
+if MFR_MODEL is None:
+    # MFR_MODEL = "unimernet_small"
+    MFR_MODEL = "pp_formulanet_plus_m"


 def img_orientation_cls_model_init():
--- a/mineru/model/mfr/pp_formulanet_plus_m/processors.py
+++ b/mineru/model/mfr/pp_formulanet_plus_m/processors.py
@@ -6,9 +6,13 @@ import re

 from PIL import Image, ImageOps
 from typing import List, Optional, Tuple, Union, Dict, Any
+
+from loguru import logger
 from tokenizers import AddedToken
 from tokenizers import Tokenizer as TokenizerFast

+from mineru.model.mfr.unimernet.unimernet_hf.modeling_unimernet import fix_latex_left_right
+

 class UniMERNetImgDecode(object):
    """Class for decoding images for UniMERNet, including cropping margins, resizing, and padding."""
@@ -589,6 +593,7 @@ class UniMERNetDecode(object):
        replaced_formula = pattern.sub(replacer, formula)
        return replaced_formula.replace('"', "")

+    UP_PATTERN = re.compile(r'\\up([a-zA-Z]+)')
    def post_process(self, text: str) -> str:
        """Post-processes a string by fixing text and normalizing it.

@@ -602,6 +607,10 @@ class UniMERNetDecode(object):

        text = self.remove_chinese_text_wrapping(text)
        text = fix_text(text)
+        text = fix_latex_left_right(text)
+        text = self.UP_PATTERN.sub(
+            lambda m: m.group(0) if m.group(1) in ["arrow", "downarrow", "lus", "silon"] else f"\\{m.group(1)}", text
+        )
        text = self.normalize(text)
        return text

--- a/mineru/utils/llm_aided.py
+++ b/mineru/utils/llm_aided.py
@@ -51,7 +51,7 @@ def llm_aided_title(page_info_list, title_aided_config):
 3. 保持字典内key-value的对应关系不变

 4. 优化层次结构：
-    - 为每个标题元素添加适当的层次结构
+    - 根据标题内容的语义为每个标题元素添加适当的层次结构
    - 行高较大的标题一般是更高级别的标题
    - 标题从前至后的层级必须是连续的，不能跳过层级
    - 标题层级最多为4级，不要添加过多的层级
@@ -61,7 +61,6 @@ def llm_aided_title(page_info_list, title_aided_config):
    - 在完成初步分级后，仔细检查分级结果的合理性
    - 根据上下文关系和逻辑顺序，对不合理的分级进行微调
    - 确保最终的分级结果符合文档的实际结构和逻辑
-    - 字典中可能包含被误当成标题的正文，你可以通过将其层级标记为 0 来排除它们

 IMPORTANT: 
 请直接返回优化过的由标题层级组成的字典，格式为{{标题id:标题层级}}，如下：
@@ -78,6 +77,8 @@ Input title list:

 Corrected title list:
 """
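+    # Disabled prompt rule 5 (removed from the prompt above, kept here for reference):
+    # - The dictionary may contain body text mistaken for titles; exclude such entries by marking their level as 0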

    retry_count = 0
    max_retries = 3
@@ -89,6 +90,7 @@ Corrected title list:
                model=title_aided_config["model"],
                messages=[
                    {'role': 'user', 'content': title_optimize_prompt}],
+                extra_body={"enable_thinking": False},
                temperature=0.7,
                stream=True,
            )