diff --git a/mineru.template.json b/mineru.template.json index 9b9971db..4cf1c152 100644 --- a/mineru.template.json +++ b/mineru.template.json @@ -17,7 +17,7 @@ "title_aided": { "api_key": "your_api_key", "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1", - "model": "qwen2.5-32b-instruct", + "model": "qwen3-next-80b-a3b-instruct", "enable": false } }, diff --git a/mineru/backend/pipeline/model_init.py b/mineru/backend/pipeline/model_init.py index c30ac95b..c22a3651 100644 --- a/mineru/backend/pipeline/model_init.py +++ b/mineru/backend/pipeline/model_init.py @@ -17,8 +17,10 @@ from ...model.table.rec.unet_table.main import UnetTableModel from ...utils.enum_class import ModelPath from ...utils.models_download_utils import auto_download_and_get_model_root_path -MFR_MODEL = "unimernet_small" -# MFR_MODEL = "pp_formulanet_plus_m" +MFR_MODEL = os.getenv('MINERU_MFR_MODEL', None) +if MFR_MODEL is None: + # MFR_MODEL = "unimernet_small" + MFR_MODEL = "pp_formulanet_plus_m" def img_orientation_cls_model_init(): diff --git a/mineru/model/mfr/pp_formulanet_plus_m/processors.py b/mineru/model/mfr/pp_formulanet_plus_m/processors.py index e4da634d..40024f31 100644 --- a/mineru/model/mfr/pp_formulanet_plus_m/processors.py +++ b/mineru/model/mfr/pp_formulanet_plus_m/processors.py @@ -6,9 +6,13 @@ import re from PIL import Image, ImageOps from typing import List, Optional, Tuple, Union, Dict, Any + +from loguru import logger from tokenizers import AddedToken from tokenizers import Tokenizer as TokenizerFast +from mineru.model.mfr.unimernet.unimernet_hf.modeling_unimernet import fix_latex_left_right + class UniMERNetImgDecode(object): """Class for decoding images for UniMERNet, including cropping margins, resizing, and padding.""" @@ -589,6 +593,7 @@ class UniMERNetDecode(object): replaced_formula = pattern.sub(replacer, formula) return replaced_formula.replace('"', "") + UP_PATTERN = re.compile(r'\\up([a-zA-Z]+)') def post_process(self, text: str) -> str: """Post-processes a string by fixing text and normalizing it. @@ -602,6 +607,10 @@ class UniMERNetDecode(object): text = self.remove_chinese_text_wrapping(text) text = fix_text(text) + text = fix_latex_left_right(text) + text = self.UP_PATTERN.sub( + lambda m: m.group(0) if m.group(1) in ["arrow", "downarrow", "lus", "silon"] else f"\\{m.group(1)}", text + ) text = self.normalize(text) return text diff --git a/mineru/utils/llm_aided.py b/mineru/utils/llm_aided.py index bdc8aba9..26285db0 100644 --- a/mineru/utils/llm_aided.py +++ b/mineru/utils/llm_aided.py @@ -51,7 +51,7 @@ def llm_aided_title(page_info_list, title_aided_config): 3. 保持字典内key-value的对应关系不变 4. 优化层次结构: - - 为每个标题元素添加适当的层次结构 + - 根据标题内容的语义为每个标题元素添加适当的层次结构 - 行高较大的标题一般是更高级别的标题 - 标题从前至后的层级必须是连续的,不能跳过层级 - 标题层级最多为4级,不要添加过多的层级 @@ -61,7 +61,6 @@ def llm_aided_title(page_info_list, title_aided_config): - 在完成初步分级后,仔细检查分级结果的合理性 - 根据上下文关系和逻辑顺序,对不合理的分级进行微调 - 确保最终的分级结果符合文档的实际结构和逻辑 - - 字典中可能包含被误当成标题的正文,你可以通过将其层级标记为 0 来排除它们 IMPORTANT: 请直接返回优化过的由标题层级组成的字典,格式为{{标题id:标题层级}},如下: @@ -78,6 +77,8 @@ Input title list: Corrected title list: """ + #5. + #- 字典中可能包含被误当成标题的正文,你可以通过将其层级标记为 0 来排除它们 retry_count = 0 max_retries = 3 @@ -89,6 +90,7 @@ Corrected title list: model=title_aided_config["model"], messages=[ {'role': 'user', 'content': title_optimize_prompt}], + extra_body={"enable_thinking": False}, temperature=0.7, stream=True, )