mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
refactor: enhance title hierarchy logic and update model configuration
This commit is contained in:
@@ -17,7 +17,7 @@
|
||||
"title_aided": {
|
||||
"api_key": "your_api_key",
|
||||
"base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
|
||||
"model": "qwen2.5-32b-instruct",
|
||||
"model": "qwen3-next-80b-a3b-instruct",
|
||||
"enable": false
|
||||
}
|
||||
},
|
||||
|
||||
@@ -17,8 +17,10 @@ from ...model.table.rec.unet_table.main import UnetTableModel
|
||||
from ...utils.enum_class import ModelPath
|
||||
from ...utils.models_download_utils import auto_download_and_get_model_root_path
|
||||
|
||||
MFR_MODEL = "unimernet_small"
|
||||
# MFR_MODEL = "pp_formulanet_plus_m"
|
||||
MFR_MODEL = os.getenv('MINERU_MFR_MODEL', None)
|
||||
if MFR_MODEL is None:
|
||||
# MFR_MODEL = "unimernet_small"
|
||||
MFR_MODEL = "pp_formulanet_plus_m"
|
||||
|
||||
|
||||
def img_orientation_cls_model_init():
|
||||
|
||||
@@ -6,9 +6,13 @@ import re
|
||||
|
||||
from PIL import Image, ImageOps
|
||||
from typing import List, Optional, Tuple, Union, Dict, Any
|
||||
|
||||
from loguru import logger
|
||||
from tokenizers import AddedToken
|
||||
from tokenizers import Tokenizer as TokenizerFast
|
||||
|
||||
from mineru.model.mfr.unimernet.unimernet_hf.modeling_unimernet import fix_latex_left_right
|
||||
|
||||
|
||||
class UniMERNetImgDecode(object):
|
||||
"""Class for decoding images for UniMERNet, including cropping margins, resizing, and padding."""
|
||||
@@ -589,6 +593,7 @@ class UniMERNetDecode(object):
|
||||
replaced_formula = pattern.sub(replacer, formula)
|
||||
return replaced_formula.replace('"', "")
|
||||
|
||||
UP_PATTERN = re.compile(r'\\up([a-zA-Z]+)')
|
||||
def post_process(self, text: str) -> str:
|
||||
"""Post-processes a string by fixing text and normalizing it.
|
||||
|
||||
@@ -602,6 +607,10 @@ class UniMERNetDecode(object):
|
||||
|
||||
text = self.remove_chinese_text_wrapping(text)
|
||||
text = fix_text(text)
|
||||
text = fix_latex_left_right(text)
|
||||
text = self.UP_PATTERN.sub(
|
||||
lambda m: m.group(0) if m.group(1) in ["arrow", "downarrow", "lus", "silon"] else f"\\{m.group(1)}", text
|
||||
)
|
||||
text = self.normalize(text)
|
||||
return text
|
||||
|
||||
|
||||
@@ -51,7 +51,7 @@ def llm_aided_title(page_info_list, title_aided_config):
|
||||
3. 保持字典内key-value的对应关系不变
|
||||
|
||||
4. 优化层次结构:
|
||||
- 为每个标题元素添加适当的层次结构
|
||||
- 根据标题内容的语义为每个标题元素添加适当的层次结构
|
||||
- 行高较大的标题一般是更高级别的标题
|
||||
- 标题从前至后的层级必须是连续的,不能跳过层级
|
||||
- 标题层级最多为4级,不要添加过多的层级
|
||||
@@ -61,7 +61,6 @@ def llm_aided_title(page_info_list, title_aided_config):
|
||||
- 在完成初步分级后,仔细检查分级结果的合理性
|
||||
- 根据上下文关系和逻辑顺序,对不合理的分级进行微调
|
||||
- 确保最终的分级结果符合文档的实际结构和逻辑
|
||||
- 字典中可能包含被误当成标题的正文,你可以通过将其层级标记为 0 来排除它们
|
||||
|
||||
IMPORTANT:
|
||||
请直接返回优化过的由标题层级组成的字典,格式为{{标题id:标题层级}},如下:
|
||||
@@ -78,6 +77,8 @@ Input title list:
|
||||
|
||||
Corrected title list:
|
||||
"""
|
||||
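# Disabled prompt rule 5 (removed from the prompt above, kept here for reference):
# - The dictionary may contain body text that was mistaken for a title; such entries can be excluded by marking their level as 0.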
|
||||
|
||||
retry_count = 0
|
||||
max_retries = 3
|
||||
@@ -89,6 +90,7 @@ Corrected title list:
|
||||
model=title_aided_config["model"],
|
||||
messages=[
|
||||
{'role': 'user', 'content': title_optimize_prompt}],
|
||||
extra_body={"enable_thinking": False},
|
||||
temperature=0.7,
|
||||
stream=True,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user