refactor: enhance title hierarchy logic and update model configuration

This commit is contained in:
myhloli
2025-10-22 15:57:07 +08:00
parent ab480a7a86
commit a220b8a208
4 changed files with 18 additions and 5 deletions

View File

@@ -17,7 +17,7 @@
"title_aided": {
"api_key": "your_api_key",
"base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
"model": "qwen2.5-32b-instruct",
"model": "qwen3-next-80b-a3b-instruct",
"enable": false
}
},

View File

@@ -17,8 +17,10 @@ from ...model.table.rec.unet_table.main import UnetTableModel
from ...utils.enum_class import ModelPath
from ...utils.models_download_utils import auto_download_and_get_model_root_path
MFR_MODEL = "unimernet_small"
# MFR_MODEL = "pp_formulanet_plus_m"
MFR_MODEL = os.getenv('MINERU_MFR_MODEL', None)
if MFR_MODEL is None:
# MFR_MODEL = "unimernet_small"
MFR_MODEL = "pp_formulanet_plus_m"
def img_orientation_cls_model_init():

View File

@@ -6,9 +6,13 @@ import re
from PIL import Image, ImageOps
from typing import List, Optional, Tuple, Union, Dict, Any
from loguru import logger
from tokenizers import AddedToken
from tokenizers import Tokenizer as TokenizerFast
from mineru.model.mfr.unimernet.unimernet_hf.modeling_unimernet import fix_latex_left_right
class UniMERNetImgDecode(object):
"""Class for decoding images for UniMERNet, including cropping margins, resizing, and padding."""
@@ -589,6 +593,7 @@ class UniMERNetDecode(object):
replaced_formula = pattern.sub(replacer, formula)
return replaced_formula.replace('"', "")
UP_PATTERN = re.compile(r'\\up([a-zA-Z]+)')
def post_process(self, text: str) -> str:
"""Post-processes a string by fixing text and normalizing it.
@@ -602,6 +607,10 @@ class UniMERNetDecode(object):
text = self.remove_chinese_text_wrapping(text)
text = fix_text(text)
text = fix_latex_left_right(text)
text = self.UP_PATTERN.sub(
lambda m: m.group(0) if m.group(1) in ["arrow", "downarrow", "lus", "silon"] else f"\\{m.group(1)}", text
)
text = self.normalize(text)
return text

View File

@@ -51,7 +51,7 @@ def llm_aided_title(page_info_list, title_aided_config):
3. 保持字典内key-value的对应关系不变
4. 优化层次结构:
- 为每个标题元素添加适当的层次结构
- 根据标题内容的语义为每个标题元素添加适当的层次结构
- 行高较大的标题一般是更高级别的标题
- 标题从前至后的层级必须是连续的,不能跳过层级
- 标题层级最多为4级不要添加过多的层级
@@ -61,7 +61,6 @@ def llm_aided_title(page_info_list, title_aided_config):
- 在完成初步分级后,仔细检查分级结果的合理性
- 根据上下文关系和逻辑顺序,对不合理的分级进行微调
- 确保最终的分级结果符合文档的实际结构和逻辑
- 字典中可能包含被误当成标题的正文,你可以通过将其层级标记为 0 来排除它们
IMPORTANT:
请直接返回优化过的由标题层级组成的字典,格式为{{标题id:标题层级}},如下:
@@ -78,6 +77,8 @@ Input title list:
Corrected title list:
"""
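# Rule dropped from section 5 of the prompt above; kept here for reference:
# - The dictionary may contain body text mistaken for titles; it can be excluded by marking its level as 0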
retry_count = 0
max_retries = 3
@@ -89,6 +90,7 @@ Corrected title list:
model=title_aided_config["model"],
messages=[
{'role': 'user', 'content': title_optimize_prompt}],
extra_body={"enable_thinking": False},
temperature=0.7,
stream=True,
)