Compare commits

...

16 Commits

Author SHA1 Message Date
Xiaomeng Zhao
91e943a1f1 Merge pull request #2405 from opendatalab/master
update version
2025-04-29 16:08:07 +08:00
myhloli
2aaf2310f2 Update version.py with new version 2025-04-29 08:06:04 +00:00
Xiaomeng Zhao
8802687934 Merge pull request #2404 from opendatalab/release-1.3.10
Release 1.3.10
2025-04-29 15:48:55 +08:00
Xiaomeng Zhao
2c2fcbe832 Merge pull request #2403 from myhloli/dev
feat(model_utils): adjust table detection threshold and add features
2025-04-29 15:27:44 +08:00
myhloli
9c37d65fab docs(README_zh-CN): update doc 2025-04-29 15:26:08 +08:00
myhloli
49a8f8be0a feat(model_utils): adjust table detection threshold and add features
- Adjust the threshold for considering tables inside other tables from 2 to 3
- Add support for custom formula delimiters through user configuration
- Pin pdfminer.six to version 20250324 to prevent parsing failures
2025-04-29 15:24:28 +08:00
Xiaomeng Zhao
5e15d9b664 Merge pull request #2402 from myhloli/dev
build(deps): pin pdfminer.six version to 20250324
2025-04-29 14:56:21 +08:00
myhloli
81daf298b5 build(deps): pin pdfminer.six version to 20250324
- Update pdfminer.six dependency from >=20250416 to ==20250324
- This change ensures compatibility with specific project requirements
2025-04-29 14:55:07 +08:00
myhloli
2d4e9e544e Merge remote-tracking branch 'origin/dev' into dev 2025-04-29 10:54:34 +08:00
myhloli
dfd13fa2ab fix(mfr): add LaTeX symbol replacements for fint and up
- Add regex patterns for replacing LaTeX symbols \fint and \up with their Unicode equivalents
2025-04-29 10:53:40 +08:00
Xiaomeng Zhao
2cf55ce1d1 Merge pull request #2395 from myhloli/dev
feat(latex): enhance LaTeX delimiter support and configurability
2025-04-28 14:37:33 +08:00
myhloli
100e9c17a5 feat(latex): enhance LaTeX delimiter support and configurability
- Add support for \(\) and \[\] delimiters in addition to $$ and $
- Make LaTeX delimiter configuration more flexible and user-defined
- Update configuration file to include LaTeX delimiter settings
- Modify OCR content generation to use configurable delimiters
2025-04-28 14:35:39 +08:00
Xiaomeng Zhao
cf33cb882d Merge pull request #2389 from myhloli/dev
fix(mfr): add underscore symbol to unimernet
2025-04-28 01:56:17 +08:00
myhloli
98dd179053 Merge remote-tracking branch 'origin/dev' into dev 2025-04-28 01:55:20 +08:00
myhloli
7d77d614ec fix(mfr): add underscore symbol to unimernet
- Add \textunderscore to the list of LaTeX patterns
- This allows the model to properly render underscore characters
2025-04-28 01:54:29 +08:00
Xiaomeng Zhao
c060413b19 Merge pull request #2388 from opendatalab/master
update version
2025-04-27 18:30:05 +08:00
10 changed files with 58 additions and 11 deletions

View File

@@ -48,6 +48,9 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
</div>
# Changelog
- 2025/04/29 1.3.10 Released
- Added support for custom formula delimiters, which can be set by modifying the `latex-delimiter-config` item in the `magic-pdf.json` file under the user directory.
- Pinned `pdfminer.six` to version `20250324` to prevent parsing failures caused by new versions.
- 2025/04/27 1.3.9 Released
- Optimized the formula parsing function to improve the success rate of formula rendering
- Updated `pdfminer.six` to the latest version, fixing some abnormal PDF parsing issues

View File

@@ -47,6 +47,9 @@
</div>
# 更新记录
- 2025/04/29 1.3.10 发布
- 支持使用自定义公式标识符,可通过修改用户目录下的`magic-pdf.json`文件中的`latex-delimiter-config`项实现。
- 锁定`pdfminer.six`为`20250324`版本,以避免新版本导致的解析失败问题。
- 2025/04/27 1.3.9 发布
- 优化公式解析功能,提升公式渲染的成功率
- 更新`pdfminer.six`到最新版本,修复了部分pdf解析异常问题

View File

@@ -20,6 +20,16 @@
"enable": true,
"max_time": 400
},
"latex-delimiter-config": {
"display": {
"left": "$$",
"right": "$$"
},
"inline": {
"left": "$",
"right": "$"
}
},
"llm-aided-config": {
"formula_aided": {
"api_key": "your_api_key",
@@ -40,5 +50,5 @@
"enable": false
}
},
"config_version": "1.2.0"
"config_version": "1.2.1"
}

View File

@@ -5,6 +5,7 @@ from loguru import logger
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.config_reader import get_latex_delimiter_config
from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
from magic_pdf.post_proc.para_split_v3 import ListLineTag
@@ -145,6 +146,19 @@ def full_to_half(text: str) -> str:
result.append(char)
return ''.join(result)
latex_delimiters_config = get_latex_delimiter_config()
default_delimiters = {
'display': {'left': '$$', 'right': '$$'},
'inline': {'left': '$', 'right': '$'}
}
delimiters = latex_delimiters_config if latex_delimiters_config else default_delimiters
display_left_delimiter = delimiters['display']['left']
display_right_delimiter = delimiters['display']['right']
inline_left_delimiter = delimiters['inline']['left']
inline_right_delimiter = delimiters['inline']['right']
def merge_para_with_text(para_block):
block_text = ''
@@ -168,9 +182,9 @@ def merge_para_with_text(para_block):
if span_type == ContentType.Text:
content = ocr_escape_special_markdown_char(span['content'])
elif span_type == ContentType.InlineEquation:
content = f"${span['content']}$"
content = f"{inline_left_delimiter}{span['content']}{inline_right_delimiter}"
elif span_type == ContentType.InterlineEquation:
content = f"\n$$\n{span['content']}\n$$\n"
content = f"\n{display_left_delimiter}\n{span['content']}\n{display_right_delimiter}\n"
content = content.strip()

View File

@@ -125,6 +125,15 @@ def get_llm_aided_config():
else:
return llm_aided_config
def get_latex_delimiter_config():
config = read_config()
latex_delimiter_config = config.get('latex-delimiter-config')
if latex_delimiter_config is None:
logger.warning(f"'latex-delimiter-config' not found in {CONFIG_FILE_NAME}, use 'None' as default")
return None
else:
return latex_delimiter_config
if __name__ == '__main__':
ak, sk, endpoint = get_s3_config('llm-raw')

View File

@@ -1 +1 @@
__version__ = "1.3.9"
__version__ = "1.3.10"

View File

@@ -342,7 +342,10 @@ REPLACEMENTS_PATTERNS = {
re.compile(r'\\Tilde'): r'\\tilde',
re.compile(r'\\slash'): r'/',
re.compile(r'\\textperthousand'): r'',
re.compile(r'\\sun'): r''
re.compile(r'\\sun'): r'',
re.compile(r'\\textunderscore'): r'\\_',
re.compile(r'\\fint'): r'',
re.compile(r'\\up '): r'\\ ',
}
QQUAD_PATTERN = re.compile(r'\\qquad(?!\s)')

View File

@@ -172,8 +172,8 @@ def filter_nested_tables(table_res_list, overlap_threshold=0.8, area_threshold=0
tables_inside = [j for j in range(len(table_res_list))
if i != j and is_inside(table_info[j], table_info[i], overlap_threshold)]
# Continue if there are at least 2 tables inside
if len(tables_inside) >= 2:
# Continue if there are at least 3 tables inside
if len(tables_inside) >= 3:
# Check if inside tables overlap with each other
tables_overlap = any(do_overlap(table_info[tables_inside[idx1]], table_info[tables_inside[idx2]])
for idx1 in range(len(tables_inside))

View File

@@ -117,8 +117,12 @@ def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table
return md_content, txt_content, archive_zip_path, new_pdf_path
latex_delimiters = [{'left': '$$', 'right': '$$', 'display': True},
{'left': '$', 'right': '$', 'display': False}]
latex_delimiters = [
{'left': '$$', 'right': '$$', 'display': True},
{'left': '$', 'right': '$', 'display': False},
{'left': '\\(', 'right': '\\)', 'display': False},
{'left': '\\[', 'right': '\\]', 'display': True},
]
def init_model():
@@ -218,7 +222,8 @@ if __name__ == '__main__':
with gr.Tabs():
with gr.Tab('Markdown rendering'):
md = gr.Markdown(label='Markdown rendering', height=1100, show_copy_button=True,
latex_delimiters=latex_delimiters, line_breaks=True)
latex_delimiters=latex_delimiters,
line_breaks=True)
with gr.Tab('Markdown text'):
md_text = gr.TextArea(lines=45, show_copy_button=True)
file.change(fn=to_pdf, inputs=file, outputs=pdf_show)

View File

@@ -10,6 +10,6 @@ scikit-learn>=1.0.2
torch>=2.2.2,!=2.5.0,!=2.5.1
torchvision
transformers>=4.49.0,!=4.51.0,<5.0.0
pdfminer.six>=20250416
pdfminer.six==20250324
tqdm>=4.67.1
# The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.