Compare commits

...

71 Commits

Author SHA1 Message Date
Xiaomeng Zhao
91e943a1f1 Merge pull request #2405 from opendatalab/master
update version
2025-04-29 16:08:07 +08:00
myhloli
2aaf2310f2 Update version.py with new version 2025-04-29 08:06:04 +00:00
Xiaomeng Zhao
8802687934 Merge pull request #2404 from opendatalab/release-1.3.10
Release 1.3.10
2025-04-29 15:48:55 +08:00
Xiaomeng Zhao
2c2fcbe832 Merge pull request #2403 from myhloli/dev
feat(model_utils): adjust table detection threshold and add features
2025-04-29 15:27:44 +08:00
myhloli
9c37d65fab docs(README_zh-CN): update doc 2025-04-29 15:26:08 +08:00
myhloli
49a8f8be0a feat(model_utils): adjust table detection threshold and add features
- Adjust the threshold for considering tables inside other tables from 2 to 3
- Add support for custom formula delimiters through user configuration
- Pin pdfminer.six to version 20250324 to prevent parsing failures
2025-04-29 15:24:28 +08:00
Xiaomeng Zhao
5e15d9b664 Merge pull request #2402 from myhloli/dev
build(deps): pin pdfminer.six version to 20250324
2025-04-29 14:56:21 +08:00
myhloli
81daf298b5 build(deps): pin pdfminer.six version to 20250324
- Update pdfminer.six dependency from >=20250416 to ==20250324
- This change ensures compatibility with specific project requirements
2025-04-29 14:55:07 +08:00
myhloli
2d4e9e544e Merge remote-tracking branch 'origin/dev' into dev 2025-04-29 10:54:34 +08:00
myhloli
dfd13fa2ab fix(mfr): add LaTeX symbol replacements for fint and up
- Add regex patterns for replacing LaTeX symbols \fint and \up with their Unicode equivalents
2025-04-29 10:53:40 +08:00
Xiaomeng Zhao
2cf55ce1d1 Merge pull request #2395 from myhloli/dev
feat(latex): enhance LaTeX delimiter support and configurability
2025-04-28 14:37:33 +08:00
myhloli
100e9c17a5 feat(latex): enhance LaTeX delimiter support and configurability
- Add support for \(\) and \[\] delimiters in addition to $ and $$
- Make LaTeX delimiter configuration more flexible and user-defined
- Update configuration file to include LaTeX delimiter settings
- Modify OCR content generation to use configurable delimiters
2025-04-28 14:35:39 +08:00
Xiaomeng Zhao
cf33cb882d Merge pull request #2389 from myhloli/dev
fix(mfr): add underscore symbol to unimernet
2025-04-28 01:56:17 +08:00
myhloli
98dd179053 Merge remote-tracking branch 'origin/dev' into dev 2025-04-28 01:55:20 +08:00
myhloli
7d77d614ec fix(mfr): add underscore symbol to unimernet
- Add \textunderscore to the list of LaTeX patterns
- This allows the model to properly render underscore characters
2025-04-28 01:54:29 +08:00
Xiaomeng Zhao
c060413b19 Merge pull request #2388 from opendatalab/master
update version
2025-04-27 18:30:05 +08:00
myhloli
1e715d026d Update version.py with new version 2025-04-27 10:23:03 +00:00
Xiaomeng Zhao
0d5762e57a Merge pull request #2381 from opendatalab/release-1.3.9
Release 1.3.9
2025-04-27 18:18:46 +08:00
Xiaomeng Zhao
d68fe15bde Merge pull request #2386 from opendatalab/dev
Dev
2025-04-27 18:18:28 +08:00
Xiaomeng Zhao
9bdc254456 Merge pull request #2385 from myhloli/dev
docs: correct typo for Apple Silicon in install guide and README
2025-04-27 18:18:01 +08:00
myhloli
ebb7df984e docs: correct typo for Apple Silicon in install guide and README
- Fix typo in install.rst and README_zh-CN.md
- Change 'apple slicon' to 'Apple silicon'
2025-04-27 18:16:46 +08:00
Xiaomeng Zhao
e54f8fd31e Merge pull request #2384 from opendatalab/dev
docs(README): fix typo
2025-04-27 18:14:46 +08:00
Xiaomeng Zhao
9f892a5e9d Merge pull request #2367 from kowyo/patch-1
docs(README): fix typo
2025-04-27 18:14:08 +08:00
Xiaomeng Zhao
623537dd9c Merge pull request #2383 from opendatalab/dev
update readme
2025-04-27 18:12:26 +08:00
Xiaomeng Zhao
c1fbf01c43 Merge pull request #2382 from myhloli/dev
feat(pdf): optimize formula parsing and update pdfminer.six
2025-04-27 18:11:47 +08:00
myhloli
0807e971fe feat(pdf): optimize formula parsing and update pdfminer.six
- Improve formula parsing success rate for better formula rendering
- Upgrade pdfminer.six to the latest version to fix PDF parsing issues
- Update changelog in both English and Chinese README files
2025-04-27 18:10:02 +08:00
Xiaomeng Zhao
ef854b23aa Merge pull request #2380 from myhloli/dev
build(deps): update pdfminer.six to latest version
2025-04-27 17:38:42 +08:00
myhloli
2d1a0f2ca6 fix(mfr): optimize LaTeX formula repair functionality
- Improve \left and \right command handling in LaTeX formulas
- Enhance environment type matching for array, matrix, and other structures
- Refactor code for better readability and maintainability
2025-04-27 17:35:36 +08:00
myhloli
c8747cffb4 fix(magic_pdf): improve LaTeX formula processing and environment handling
- Refactor LaTeX left/right pair fixing logic for better balance
- Add environment detection and correction for common math environments
- Implement more robust whitespace handling and command substitution
- Optimize regex patterns for improved performance and readability
2025-04-27 17:10:15 +08:00
myhloli
0299dea199 build(deps): update pdfminer.six to latest version
- Change pdfminer.six dependency from ==20231228 to >=20250416
- This update ensures compatibility with the latest version of pdfminer.six
2025-04-27 16:38:34 +08:00
myhloli
2e91fb3f52 fix(mfr): improve LaTeX formula processing and repair
- Add functions to fix LaTeX left and right commands
- Implement brace matching and repair in LaTeX formulas
- Remove unnecessary whitespace and repair LaTeX code
- Replace specific LaTeX commands with appropriate alternatives
- Add logging for debugging purposes
2025-04-25 20:43:39 +08:00
myhloli
6c1511517a fix(mfr): improve LaTeX formula processing and repair
- Add functions to fix LaTeX left and right commands
- Implement brace matching and repair in LaTeX formulas
- Remove unnecessary whitespace and repair LaTeX code
- Replace specific LaTeX commands with appropriate alternatives
- Add logging for debugging purposes
2025-04-25 20:12:50 +08:00
github-actions[bot]
b864062a4f @kowyo has signed the CLA in opendatalab/MinerU#2367 2025-04-25 02:54:31 +00:00
小林在忙毕业设计
c1558af3ef docs(README): fix typo 2025-04-24 23:08:19 +08:00
Xiaomeng Zhao
2a9ac8939f Merge pull request #2365 from myhloli/dev
fix(mfr): improve LaTeX whitespace handling in unimernet model
2025-04-24 19:34:45 +08:00
myhloli
bfb80cb2e5 fix(mfr): improve LaTeX whitespace handling in unimernet model
- Preserve "\ " sequences during whitespace removal
- Add temporary substitution to prevent incorrect processing of "\ " sequences
- Restore "\ " sequences after removing unnecessary whitespace
2025-04-24 19:33:03 +08:00
Xiaomeng Zhao
80a80482f3 Merge pull request #2356 from opendatalab/master
master->dev
2025-04-23 18:50:42 +08:00
myhloli
a24b9ed8fd Merge remote-tracking branch 'origin/master' 2025-04-23 18:48:46 +08:00
myhloli
e0dc6c8473 docs(README): update changelog for version 1.3.8 release 2025-04-23 18:48:32 +08:00
myhloli
801d3ade19 Update version.py with new version 2025-04-23 10:41:07 +00:00
Xiaomeng Zhao
6b7a861e8f Merge pull request #2354 from opendatalab/release-1.3.8
Release 1.3.8
2025-04-23 18:38:42 +08:00
Xiaomeng Zhao
9fbaee9e89 Merge pull request #2353 from myhloli/dev
test(table): update test_rapidtable.py to handle SegLink text variations
2025-04-23 18:27:20 +08:00
myhloli
61fa95d4e0 test(table): update test_rapidtable.py to handle SegLink text variations
- Modify assertion for first cell text to check for 'SegLink' instead of exact match
- This change accommodates variations in SegLink text format
2025-04-23 18:26:19 +08:00
Xiaomeng Zhao
5c232f0587 Merge pull request #2352 from myhloli/dev
feat(ocr): add new Chinese OCR model and update language support
2025-04-23 18:15:25 +08:00
myhloli
45f5082613 refactor(ocr): update device parameter handling in paddleocr2pytorch
- Replace get_device() function call with direct 'device' variable usage
- Simplify device configuration in OCR model initialization
2025-04-23 18:13:58 +08:00
myhloli
4f88fcaa51 feat(ocr): add new Chinese OCR model and update language support
- Add new Chinese OCR model (ch_PP-OCRv4_rec_server_doc_infer) for server-side use
- Update language support in app.py to include new Chinese model
- Modify models_config.yml to add new model configuration
2025-04-23 18:06:12 +08:00
Xiaomeng Zhao
3cf1ea1f5b Merge pull request #2316 from opendatalab/master
master->dev
2025-04-22 19:28:21 +08:00
myhloli
d874563e38 Update version.py with new version 2025-04-22 11:27:25 +00:00
Xiaomeng Zhao
55fcb7387f Merge pull request #2315 from opendatalab/release-1.3.7
Release 1.3.7
2025-04-22 19:26:03 +08:00
Xiaomeng Zhao
f2169686e1 Merge pull request #2314 from myhloli/dev
refactor(table): replace ocr_engine with lang in table model prediction
2025-04-22 19:25:00 +08:00
myhloli
9c4e779b91 fix(lang|performance): resolve lang parameter issue and speed up OCR/table parsing
- Fix lang parameter ineffectiveness during table parsing model initialization
- Resolve significant slowdown in OCR and table parsing speed in CPU mode
- Update changelog in README.md and README_zh-CN.md
2025-04-22 19:15:29 +08:00
myhloli
8d9070db10 fix(lang|performance): resolve lang parameter issue and speed up OCR/table parsing
- Fix lang parameter ineffectiveness during table parsing model initialization
- Resolve significant slowdown in OCR and table parsing speed in CPU mode
- Update changelog in README.md and README_zh-CN.md
2025-04-22 19:15:01 +08:00
myhloli
69cdea908d fix(ocr): switch to ch_lite model for Chinese OCR on CPU
- Automatically change to ch_lite model when using CPU for Chinese OCR
- This modification improves performance on CPU devices
2025-04-22 19:12:35 +08:00
myhloli
1d1c7ba9ab refactor(table): replace ocr_engine with lang in table model prediction
- Remove OCR engine instantiation inside the loop
- Pass language directly to the table model instead of OCR engine
- Simplify code structure and improve readability
2025-04-22 18:55:10 +08:00
myhloli
4d5fd0ee55 Update version.py with new version 2025-04-21 06:45:36 +00:00
Xiaomeng Zhao
601b44bfe0 Merge pull request #2298 from opendatalab/release-1.3.6
Release 1.3.6
2025-04-21 14:37:23 +08:00
Xiaomeng Zhao
012327badb Merge pull request #2297 from myhloli/dev
feat: add support for JPEG images and update documentation
2025-04-21 14:26:35 +08:00
myhloli
fcb5660f6a feat: add support for JPEG images and update documentation
- Add '.jpeg' to the list of supported image extensions in app.py and read_api.py
- Update projects READMEs to indicate that web_demo is deprecated
2025-04-21 14:22:23 +08:00
myhloli
d105d87cf5 Merge remote-tracking branch 'origin/dev' into dev 2025-04-18 10:56:36 +08:00
myhloli
619b3b6d32 docs(README): update bug report template to reference Readme instead of Docs
- Update the bug report template to direct users to search the MinerU Readme instead of Docs
- This change ensures users check the most relevant and up-to-date information source before reporting issues
2025-04-18 10:56:26 +08:00
Xiaomeng Zhao
6fbbe3e6f0 Merge pull request #2274 from opendatalab/dev
docs: update issue templates and disable blank issues
2025-04-17 18:46:05 +08:00
Xiaomeng Zhao
a47b17cd88 Merge pull request #2273 from myhloli/dev
docs: update issue templates and disable blank issues
2025-04-17 18:45:26 +08:00
myhloli
737d7d6eb9 docs: update issue templates and disable blank issues
- Update bug report template with more detailed instructions and sections
- Add operating system version field to bug report
- Include support for MPS in device options
- Disable blank issues and provide alternative contact links
- Remove feature request template
2025-04-17 18:44:20 +08:00
Xiaomeng Zhao
3492744ce1 Merge pull request #2269 from dt-yy/dev
update test case
2025-04-17 15:23:19 +08:00
dt-yy
a1fe370270 update test case 2025-04-17 15:21:41 +08:00
dt-yy
fea756fd3e update test case 2025-04-17 14:34:54 +08:00
dt-yy
e98988920e update test case 2025-04-17 14:24:58 +08:00
github-actions[bot]
19fd2cfa37 @vloum has signed the CLA in opendatalab/MinerU#2267 2025-04-17 03:55:12 +00:00
Xiaomeng Zhao
74f9978e02 Merge pull request #2266 from opendatalab/master
master->dev
2025-04-17 11:42:23 +08:00
myhloli
0c9572c871 Update version.py with new version 2025-04-17 03:34:11 +00:00
Xiaomeng Zhao
8fb6794b95 Merge pull request #2265 from opendatalab/release-1.3.5
Release 1.3.5
2025-04-17 11:31:24 +08:00
27 changed files with 16148 additions and 124 deletions

View File

@@ -1,4 +1,4 @@
name: Bug Report | 反馈 Bug
name: 🐛 Bug Report
description: Create a bug report for MinerU | MinerU 的 Bug 反馈
labels: bug
@@ -6,14 +6,32 @@ labels: bug
# empty string, Github seems to reject this .yml file.
body:
- type: markdown
attributes:
value: |
Thank you for submitting a MinerU 🐛 Bug Report! | 感谢您提交 MinerU 🐛 Bug 反馈!
- type: checkboxes
attributes:
label: 🔎 Search before asking | 提交之前请先搜索
description: >
Please search the MinerU [Readme](https://github.com/opendatalab/MinerU), [Issues](https://github.com/opendatalab/MinerU/issues) and [Discussions](https://github.com/opendatalab/MinerU/discussions) to see if a similar bug report already exists.
options:
- label: I have searched the MinerU [Readme](https://github.com/opendatalab/MinerU) and found no similar bug report.
required: true
- label: I have searched the MinerU [Issues](https://github.com/opendatalab/MinerU/issues) and found no similar bug report.
required: true
- label: I have searched the MinerU [Discussions](https://github.com/opendatalab/MinerU/discussions) and found no similar bug report.
required: true
- type: textarea
id: description
attributes:
label: Description of the bug | 错误描述
description: |
A clear and concise description of the bug. | 简单描述遇到的问题
Provide console output with error messages and/or screenshots of the bug. | 请提供详细报错信息或者截图
placeholder: |
💡 ProTip! Include as much information as possible (screenshots, logs, tracebacks etc.) to receive the most helpful response.
validations:
required: true
@@ -24,11 +42,12 @@ body:
# Should not word-wrap this description here.
description: |
* Explain the steps required to reproduce the bug. | 说明复现此错误所需的步骤。
* Include required code snippets, example files, etc. | 包含必要的代码片段、示例文件等。
* Describe what you expected to happen (if not obvious). | 描述你期望发生的情况
* If applicable, add screenshots to help explain the problem. | 添加截图以帮助解释问题。
* Include any other information that could be relevant, for example information about the Python environment. | 包括任何其他可能相关的信息
If you have questions about the parsing results or encounter errors during execution: | 如对解析结果有疑问或在运行中出现报错等异常:
* Provide a minimal reproducible example. | 请提供一个最小可复现的demo。
* The demo should include the complete steps, code, and the PDF file to be parsed. | demo需要包含完整的操作步骤代码以及需要解析的PDF文件
* When reporting parsing result anomalies and runtime errors, reproducible PDF files are essential. If the document is too large or confidential, you can print the problematic page(s) via the browser and submit the corresponding example file.
* 在反馈解析结果异常和运行时报错时可复现的PDF文件是必不可少的如文档过大或涉密您可通过浏览器打印出出现问题的某一页或某几页再提交相应的示例文件
For problems when building or installing MinerU: | 在构建或安装 MinerU 时遇到的问题:
* Give the **exact** build/install commands that were run. | 提供**确切**的构建/安装命令。
@@ -44,9 +63,9 @@ body:
- type: dropdown
id: os_name
id: os_mode
attributes:
label: Operating system | 操作系统
label: Operating System Mode | 操作系统类型
#multiple: true
options:
-
@@ -56,6 +75,22 @@ body:
validations:
required: true
- type: textarea
id: os_name_version
attributes:
label: Operating System Version | 操作系统版本
#multiple: true
description: |
* 如果您使用的是Linux系统请提供Linux系统的**发行版名称**和**版本号**来帮助开发人员排查问题。
* If you are using a Linux system, please provide the Linux distribution and version number to help developers troubleshoot the issue.
* 如果您使用的是Windows或MacOS系统请提供操作系统的**版本号**来帮助开发人员排查问题。
* If you are using a Windows or MacOS system, please provide the version number of the operating system to help developers troubleshoot the issue.
* 例如Ubuntu 22.04, CentOS 7.9, MacOS 15.1, Windows 11
* For example: Ubuntu 22.04, CentOS 7.9, MacOS 15.1, Windows 11.
validations:
required: true
- type: dropdown
id: python_version
attributes:
@@ -94,6 +129,7 @@ body:
-
- cpu
- cuda
- mps
- npu
validations:
required: true

11
.github/ISSUE_TEMPLATE/config.yml vendored Normal file
View File

@@ -0,0 +1,11 @@
blank_issues_enabled: false
contact_links:
- name: 🙏 Q&A
url: https://github.com/opendatalab/MinerU/discussions/categories/q-a
about: Ask the community for help
- name: 💡 Feature requests and ideas
url: https://github.com/opendatalab/MinerU/discussions/categories/ideas
about: Share ideas for new features
- name: 🙌 Show and tell
url: https://github.com/opendatalab/MinerU/discussions/categories/show-and-tell
about: Show off something you've made

View File

@@ -1,28 +0,0 @@
---
name: Feature request | 功能需求
about: Suggest an idea for this project | 提出一个有价值的idea
title: ''
labels: enhancement
assignees: ''
---
**Is your feature request related to a problem? Please describe.**
**您的特性请求是否与某个问题相关?请描述。**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
对存在的问题进行清晰且简洁的描述。例如:我一直很困扰的是 [...]
**Describe the solution you'd like**
**描述您期望的解决方案**
A clear and concise description of what you want to happen.
清晰且简洁地描述您希望实现的内容。
**Describe alternatives you've considered**
**描述您已考虑的替代方案**
A clear and concise description of any alternative solutions or features you've considered.
清晰且简洁地描述您已经考虑过的任何替代解决方案。
**Additional context**
**提供更多细节**
Add any other context or screenshots about the feature request here.
请附上任何相关截图、链接或文件,以帮助我们更好地理解您的请求。

View File

@@ -48,6 +48,21 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
</div>
# Changelog
- 2025/04/29 1.3.10 Released
- Support for custom formula delimiters can be achieved by modifying the `latex-delimiter-config` item in the `magic-pdf.json` file under the user directory.
- Pinned `pdfminer.six` to version `20250324` to prevent parsing failures caused by new versions.
- 2025/04/27 1.3.9 Released
- Optimized the formula parsing function to improve the success rate of formula rendering
- Updated `pdfminer.six` to the latest version, fixing some abnormal PDF parsing issues
- 2025/04/23 1.3.8 Released
- The default `ocr` model (`ch`) has been updated to `PP-OCRv4_server_rec_doc` (model update required)
- `PP-OCRv4_server_rec_doc` is trained on a mix of more Chinese document data and PP-OCR training data, enhancing recognition capabilities for some traditional Chinese characters, Japanese, and special characters. It supports over 15,000 recognizable characters, improving text recognition in documents while also boosting general text recognition.
- [Performance comparison between PP-OCRv4_server_rec_doc, PP-OCRv4_server_rec, and PP-OCRv4_mobile_rec](https://paddlepaddle.github.io/PaddleX/latest/en/module_usage/tutorials/ocr_modules/text_recognition.html#ii-supported-model-list)
- Verified results show that the `PP-OCRv4_server_rec_doc` model significantly improves accuracy in both single-language (`Chinese`, `English`, `Japanese`, `Traditional Chinese`) and mixed-language scenarios, with speed comparable to `PP-OCRv4_server_rec`, making it suitable for most use cases.
- In a small number of pure English scenarios, the `PP-OCRv4_server_rec_doc` model may encounter word concatenation issues, whereas `PP-OCRv4_server_rec` performs better in such cases. Therefore, we have retained the `PP-OCRv4_server_rec` model, which users can invoke by passing the parameter `lang='ch_server'`(python api) or `--lang ch_server`(cli).
- 2025/04/22 1.3.7 Released
- Fixed the issue where the `lang` parameter was ineffective during table parsing model initialization.
- Fixed the significant slowdown in OCR and table parsing speed in `cpu` mode.
- 2025/04/16 1.3.4 Released
- Slightly improved the speed of OCR detection by removing some unused blocks.
- Fixed page-level sorting errors caused by footnotes in certain cases.
@@ -365,7 +380,7 @@ There are three different ways to experience MinerU:
<td colspan="2">GPU VRAM 6GB or more</td>
<td colspan="2">All GPUs with Tensor Cores produced from Volta(2017) onwards.<br>
More than 6GB VRAM </td>
<td rowspan="2">apple slicon</td>
<td rowspan="2">Apple silicon</td>
</tr>
</table>

View File

@@ -47,6 +47,21 @@
</div>
# 更新记录
- 2025/04/29 1.3.10 发布
- 支持使用自定义公式标识符,可通过修改用户目录下的`magic-pdf.json`文件中的`latex-delimiter-config`项实现。
- 锁定`pdfminer.six`为`20250324`版本，以避免新版本导致的解析失败问题。
- 2025/04/27 1.3.9 发布
- 优化公式解析功能,提升公式渲染的成功率
- 更新`pdfminer.six`到最新版本修复了部分pdf解析异常问题
- 2025/04/23 1.3.8 发布
- `ocr`默认模型(`ch`)更新为`PP-OCRv4_server_rec_doc`(需更新模型)
- `PP-OCRv4_server_rec_doc`是在`PP-OCRv4_server_rec`的基础上在更多中文文档数据和PP-OCR训练数据的混合数据训练而成增加了部分繁体字、日文、特殊字符的识别能力可支持识别的字符为1.5万+,除文档相关的文字识别能力提升外,也同时提升了通用文字的识别能力。
- [PP-OCRv4_server_rec_doc/PP-OCRv4_server_rec/PP-OCRv4_mobile_rec 性能对比](https://paddlepaddle.github.io/PaddleX/latest/module_usage/tutorials/ocr_modules/text_recognition.html#_3)
- 经验证,`PP-OCRv4_server_rec_doc`模型在`中英日繁`单种语言或多种语言混合场景均有明显精度提升,且速度与`PP-OCRv4_server_rec`相当,适合绝大部分场景使用。
- `PP-OCRv4_server_rec_doc`在小部分纯英文场景可能会发生单词粘连问题,`PP-OCRv4_server_rec`则在此场景下表现更好,因此我们保留了`PP-OCRv4_server_rec`模型,用户可通过增加参数`lang='ch_server'`(python api)或`--lang ch_server`(命令行)调用。
- 2025/04/22 1.3.7 发布
- 修复表格解析模型初始化时lang参数失效的问题
- 修复在`cpu`模式下ocr和表格解析速度大幅下降的问题
- 2025/04/16 1.3.4 发布
- 通过移除一些无用的块小幅提升了ocr-det的速度
- 修复部分情况下由footnote导致的页面内排序错误
@@ -355,7 +370,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
<td colspan="2">
Volta(2017)及之后生产的全部带Tensor Core的GPU <br>
6G显存及以上</td>
<td rowspan="2">apple slicon</td>
<td rowspan="2">Apple silicon</td>
</tr>
</table>

View File

@@ -20,6 +20,16 @@
"enable": true,
"max_time": 400
},
"latex-delimiter-config": {
"display": {
"left": "$$",
"right": "$$"
},
"inline": {
"left": "$",
"right": "$"
}
},
"llm-aided-config": {
"formula_aided": {
"api_key": "your_api_key",
@@ -40,5 +50,5 @@
"enable": false
}
},
"config_version": "1.2.0"
"config_version": "1.2.1"
}

View File

@@ -116,7 +116,7 @@ def read_local_office(path: str) -> list[PymuDocDataset]:
shutil.rmtree(temp_dir)
return ret
def read_local_images(path: str, suffixes: list[str]=['.png', '.jpg']) -> list[ImageDataset]:
def read_local_images(path: str, suffixes: list[str]=['.png', '.jpg', '.jpeg']) -> list[ImageDataset]:
"""Read images from path or directory.
Args:

View File

@@ -5,6 +5,7 @@ from loguru import logger
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.config_reader import get_latex_delimiter_config
from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
from magic_pdf.post_proc.para_split_v3 import ListLineTag
@@ -145,6 +146,19 @@ def full_to_half(text: str) -> str:
result.append(char)
return ''.join(result)
# Formula delimiters are resolved once, at import time, from the user's
# configuration file; every formula rendered by this module uses them.
latex_delimiters_config = get_latex_delimiter_config()

# Fallback delimiters used for anything the user configuration leaves out.
default_delimiters = {
    'display': {'left': '$$', 'right': '$$'},
    'inline': {'left': '$', 'right': '$'}
}


def _resolve_latex_delimiters(user_config):
    """Overlay the user's delimiter config on ``default_delimiters``.

    Args:
        user_config: Value of ``latex-delimiter-config`` from the config
            file, or ``None`` when the section is absent.

    Returns:
        dict: ``{'display': {'left', 'right'}, 'inline': {'left', 'right'}}``
        where every entry missing from ``user_config`` keeps its default.
    """
    resolved = {}
    for kind, fallback in default_delimiters.items():
        override = user_config.get(kind) if isinstance(user_config, dict) else None
        # A partial section (e.g. only "left") must not drop the other side,
        # otherwise the lookups below raise KeyError while importing.
        if isinstance(override, dict):
            resolved[kind] = {**fallback, **override}
        else:
            resolved[kind] = dict(fallback)
    return resolved


delimiters = _resolve_latex_delimiters(latex_delimiters_config)

display_left_delimiter = delimiters['display']['left']
display_right_delimiter = delimiters['display']['right']
inline_left_delimiter = delimiters['inline']['left']
inline_right_delimiter = delimiters['inline']['right']
def merge_para_with_text(para_block):
block_text = ''
@@ -168,9 +182,9 @@ def merge_para_with_text(para_block):
if span_type == ContentType.Text:
content = ocr_escape_special_markdown_char(span['content'])
elif span_type == ContentType.InlineEquation:
content = f"${span['content']}$"
content = f"{inline_left_delimiter}{span['content']}{inline_right_delimiter}"
elif span_type == ContentType.InterlineEquation:
content = f"\n$$\n{span['content']}\n$$\n"
content = f"\n{display_left_delimiter}\n{span['content']}\n{display_right_delimiter}\n"
content = content.strip()

View File

@@ -125,6 +125,15 @@ def get_llm_aided_config():
else:
return llm_aided_config
def get_latex_delimiter_config():
    """Return the ``latex-delimiter-config`` section of the config file.

    Returns:
        The value stored under ``latex-delimiter-config``, or ``None`` (after
        logging a warning) when the section is missing.
    """
    section = read_config().get('latex-delimiter-config')
    if section is not None:
        return section

    logger.warning(f"'latex-delimiter-config' not found in {CONFIG_FILE_NAME}, use 'None' as default")
    return None
if __name__ == '__main__':
ak, sk, endpoint = get_s3_config('llm-raw')

View File

@@ -1 +1 @@
__version__ = "1.3.4"
__version__ = "1.3.10"

View File

@@ -161,20 +161,13 @@ class BatchAnalyze:
for table_res_dict in tqdm(table_res_list_all_page, desc="Table Predict"):
_lang = table_res_dict['lang']
atom_model_manager = AtomModelSingleton()
ocr_engine = atom_model_manager.get_atom_model(
atom_model_name='ocr',
ocr_show_log=False,
det_db_box_thresh=0.5,
det_db_unclip_ratio=1.6,
lang=_lang
)
table_model = atom_model_manager.get_atom_model(
atom_model_name='table',
table_model_name='rapid_table',
table_model_path='',
table_max_time=400,
device='cpu',
ocr_engine=ocr_engine,
lang=_lang,
table_sub_model_name='slanet_plus'
)
html_code, table_cell_bboxes, logic_points, elapse = table_model.predict(table_res_dict['table_img'])

View File

@@ -5,6 +5,7 @@ from typing import Optional
import torch
from ftfy import fix_text
from loguru import logger
from transformers import AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer, PretrainedConfig, PreTrainedModel
from transformers import VisionEncoderDecoderConfig, VisionEncoderDecoderModel
@@ -57,22 +58,319 @@ class TokenizerWrapper:
return toks
def latex_rm_whitespace(s: str):
"""Remove unnecessary whitespace from LaTeX code.
# The negative lookahead (?![a-zA-Z]) restricts every pattern to the control
# words \left and \right themselves.  Without it, longer commands that merely
# start with the same letters (\leftarrow, \rightarrow, \lefteqn, ...) were
# matched and rewritten to "\left." / "\right." or stripped of their prefix.
LEFT_PATTERN = re.compile(r'(\\left)(?![a-zA-Z])(\S*)')
RIGHT_PATTERN = re.compile(r'(\\right)(?![a-zA-Z])(\S*)')
LEFT_COUNT_PATTERN = re.compile(r'\\left(?![a-zA-Z])')
RIGHT_COUNT_PATTERN = re.compile(r'\\right(?![a-zA-Z])')
LEFT_RIGHT_REMOVE_PATTERN = re.compile(r'\\left(?![a-zA-Z])\.?|\\right(?![a-zA-Z])\.?')


def fix_latex_left_right(s):
    r"""Repair ``\left`` / ``\right`` commands in a LaTeX formula.

    1. Every ``\left`` / ``\right`` must be followed by a whitelisted
       delimiter; when the non-whitespace token that follows is empty or not
       whitelisted, it is replaced by the null delimiter ``.``.
    2. The numbers of ``\left`` and ``\right`` must match.  If they do, the
       formula is handed to ``fix_left_right_pairs`` to repair pairs that are
       split across brace groups; if they do not, every ``\left`` / ``\right``
       (plus a directly following ``.``) is removed so the formula can still
       be rendered.

    Args:
        s (str): LaTeX formula.

    Returns:
        str: The repaired formula.
    """
    # Whitelist of delimiters accepted after \left / \right.
    valid_delims = (r'(', r')', r'[', r']', r'{', r'}', r'/', r'|',
                    r'\{', r'\}', r'\lceil', r'\rceil', r'\lfloor',
                    r'\rfloor', r'\backslash', r'\uparrow', r'\downarrow',
                    r'\Uparrow', r'\Downarrow', r'\|', r'\.')

    def fix_delim(match):
        cmd = match.group(1)   # \left or \right
        rest = match.group(2)  # non-whitespace token glued to the command
        if not rest or rest not in valid_delims:
            # Missing or unknown delimiter: fall back to the null delimiter.
            return cmd + "."
        return match.group(0)

    s = LEFT_PATTERN.sub(fix_delim, s)
    s = RIGHT_PATTERN.sub(fix_delim, s)

    left_count = len(LEFT_COUNT_PATTERN.findall(s))
    right_count = len(RIGHT_COUNT_PATTERN.findall(s))

    if left_count == right_count:
        # Balanced counts: make sure each pair lives in the same brace group.
        return fix_left_right_pairs(s)

    # Unbalanced counts cannot be paired up reliably; drop all of them.
    return LEFT_RIGHT_REMOVE_PATTERN.sub('', s)
def _is_command_at(text, pos, command):
    r"""Return True if ``command`` starts at ``pos`` as a complete control word.

    A character must follow the command (its delimiter) and that character
    must not be an ASCII letter; otherwise the text is a longer command such
    as ``\leftarrow`` or ``\rightarrow``.
    """
    end = pos + len(command)
    if end >= len(text) or not text.startswith(command, pos):
        return False
    follower = text[end]
    return not (follower.isascii() and follower.isalpha())


def _delimiter_end(text, pos):
    r"""Return the index just past the delimiter token that starts at ``pos``.

    A delimiter is a single character (``(``, ``|``, ``.``), a control symbol
    (``\{``, ``\|``) or a control word (``\lceil``, ``\rfloor``).
    """
    size = len(text)
    if pos >= size:
        return size
    if text[pos] != '\\':
        return pos + 1
    end = pos + 1
    if end < size and text[end].isascii() and text[end].isalpha():
        while end < size and text[end].isascii() and text[end].isalpha():
            end += 1
        return end
    return min(end + 1, size)


def fix_left_right_pairs(latex_formula):
    r"""Move ``\right`` commands into the brace group of their ``\left``.

    The formula is scanned once while tracking the brace nesting depth.  When
    a ``\right`` closes a ``\left`` opened at a different depth, the whole
    ``\right<delimiter>`` token is relocated to just before the ``}`` that
    closes the group containing the ``\left``.

    Args:
        latex_formula (str): LaTeX formula.

    Returns:
        str: The formula with misplaced ``\right`` commands relocated; the
        input is returned unchanged when nothing needs to move.
    """
    size = len(latex_formula)
    brace_depth = 0   # current nesting depth of unescaped braces
    left_stack = []   # (position, brace depth) of every open \left
    moves = []        # (start, end, target) of \right tokens to relocate

    i = 0
    while i < size:
        # A character preceded by an odd number of backslashes is escaped
        # (e.g. the "{" of "\{" or the second "\" of "\\").
        if is_escaped(latex_formula, i):
            i += 1
            continue

        if _is_command_at(latex_formula, i, '\\left'):
            left_stack.append((i, brace_depth))
            # Skip the delimiter so a "{" / "}" used as one is not counted.
            i = _delimiter_end(latex_formula, i + 5)
            continue

        if _is_command_at(latex_formula, i, '\\right'):
            end = _delimiter_end(latex_formula, i + 6)
            if left_stack:
                left_pos, left_depth = left_stack.pop()
                if left_depth != brace_depth:
                    target = find_group_end(latex_formula, left_pos, left_depth)
                    if target != -1:
                        moves.append((i, end, target))
            i = end
            continue

        if latex_formula[i] == '{':
            brace_depth += 1
        elif latex_formula[i] == '}' and brace_depth:
            brace_depth -= 1
        i += 1

    if not moves:
        return latex_formula

    # Rebuild the string in a single pass over the ORIGINAL indices so that
    # one relocation can never invalidate the positions of another.
    dropped = set()
    inserted = {}
    for start, end, target in moves:
        dropped.update(range(start, end))
        inserted.setdefault(target, []).append(latex_formula[start:end])

    pieces = []
    for index, char in enumerate(latex_formula):
        if index in inserted:
            pieces.extend(inserted[index])
        if index not in dropped:
            pieces.append(char)
    return ''.join(pieces)


def find_group_end(text, pos, depth):
    """Find where the brace group at nesting level ``depth`` closes.

    Scans forward from ``pos`` with the nesting counter starting at ``depth``
    and ignoring escaped braces.

    Returns:
        int: Index of the first unescaped ``}`` that brings the counter below
        ``depth``, or -1 if there is none.
    """
    current_depth = depth
    i = pos

    while i < len(text):
        if text[i] == '{' and not is_escaped(text, i):
            current_depth += 1
        elif text[i] == '}' and not is_escaped(text, i):
            current_depth -= 1
            if current_depth < depth:
                return i
        i += 1

    return -1  # no closing brace found


def is_escaped(text, pos):
    """Return True if ``text[pos]`` is preceded by an odd number of backslashes."""
    backslash_count = 0
    j = pos - 1
    while j >= 0 and text[j] == '\\':
        backslash_count += 1
        j -= 1

    return backslash_count % 2 == 1
def fix_unbalanced_braces(latex_formula):
    r"""Drop every curly brace that has no partner.

    Escaped braces (``\{`` / ``\}``, i.e. preceded by an odd number of
    backslashes) are literal characters and take no part in the matching.

    Args:
        latex_formula (str): LaTeX formula.

    Returns:
        str: The formula without unmatched ``{`` and ``}``.
    """
    open_positions = []  # indices of "{" still waiting for a "}"
    orphans = set()      # indices of braces that must be removed

    for idx, ch in enumerate(latex_formula):
        if ch != '{' and ch != '}':
            continue

        # Walk back over the run of backslashes directly before the brace.
        cursor = idx - 1
        while cursor >= 0 and latex_formula[cursor] == '\\':
            cursor -= 1
        if (idx - 1 - cursor) % 2 == 1:
            continue  # escaped brace

        if ch == '{':
            open_positions.append(idx)
        elif open_positions:
            open_positions.pop()
        else:
            orphans.add(idx)  # closing brace without an opener

    # Whatever is still open never got closed.
    orphans.update(open_positions)

    return ''.join(ch for idx, ch in enumerate(latex_formula) if idx not in orphans)
def process_latex(input_string):
    r"""Normalise what follows each backslash in a LaTeX formula.

    For every backslash followed by a character (other than a newline):

    1. If that character is one of ``#$%&~_^|\{}`` or whitespace, the pair
       is kept as is.
    2. If it is an ASCII letter and the character after it is an ASCII letter
       too (a command name of at least two letters), the pair is kept as is.
    3. Otherwise a space is inserted right after the backslash.

    Args:
        input_string (str): LaTeX formula.

    Returns:
        str: The processed formula.
    """
    untouched_followers = "#$%&~_^|\\{} \t\n\r\v\f"

    def _is_ascii_letter(ch):
        return 'a' <= ch <= 'z' or 'A' <= ch <= 'Z'

    def _rewrite(match):
        follower = match.group(1)
        if follower in untouched_followers:
            return match.group(0)

        # Offsets refer to the original string, which re.sub scans unchanged.
        after = match.start() + 2
        if (_is_ascii_letter(follower)
                and after < len(input_string)
                and _is_ascii_letter(input_string[after])):
            return match.group(0)

        return '\\ ' + follower

    return re.sub(r'\\(.)', _rewrite, input_string)
# Math environments commonly available in KaTeX / MathJax.
ENV_TYPES = ['array', 'matrix', 'pmatrix', 'bmatrix', 'vmatrix',
             'Bmatrix', 'Vmatrix', 'cases', 'aligned', 'gathered']
# Per-environment patterns: \begin{env}, \end{env} and \begin{env}{<format>}.
ENV_BEGIN_PATTERNS = {env: re.compile(r'\\begin\{%s\}' % env) for env in ENV_TYPES}
ENV_END_PATTERNS = {env: re.compile(r'\\end\{%s\}' % env) for env in ENV_TYPES}
ENV_FORMAT_PATTERNS = {env: re.compile(r'\\begin\{%s\}\{([^}]*)\}' % env) for env in ENV_TYPES}


def fix_latex_environments(s):
    r"""Balance ``\begin{env}`` / ``\end{env}`` for the known environments.

    1. Missing ``\begin`` tags are prepended to the formula.  They reuse the
       format argument of the first existing ``\begin{env}{...}``; without
       one, ``array`` falls back to ``{c}`` and other environments get none.
    2. Missing ``\end`` tags are appended to the formula.

    Args:
        s (str): LaTeX formula.

    Returns:
        str: The formula with matching begin/end counts per environment.
    """
    for env in ENV_TYPES:
        opened = len(ENV_BEGIN_PATTERNS[env].findall(s))
        closed = len(ENV_END_PATTERNS[env].findall(s))
        surplus_ends = closed - opened

        if surplus_ends > 0:
            spec = ENV_FORMAT_PATTERNS[env].search(s)
            if spec:
                format_str = '{' + spec.group(1) + '}'
            elif env == 'array':
                format_str = '{c}'
            else:
                format_str = ''
            opener = '\\begin{' + env + '}' + format_str + ' '
            s = opener * surplus_ends + s
        elif surplus_ends < 0:
            closer = ' \\end{' + env + '}'
            s = s + closer * (-surplus_ends)

    return s
# "\up" immediately followed by ASCII letters; the letters are captured.
UP_PATTERN = re.compile(r'\\up([a-zA-Z]+)')

# Command names that are deleted outright (only the name itself is matched).
COMMANDS_TO_REMOVE_PATTERN = re.compile(
    r'\\(?:'
    + '|'.join((
        'lefteqn', 'boldmath', 'ensuremath', 'centering', 'textsubscript',
        'sides', 'textsl', 'textcent', 'emph',
    ))
    + r')'
)

# Compiled pattern -> replacement template for re.sub.
# NOTE(review): \textperthousand, \sun and \fint map to the empty string,
# i.e. the command is simply removed - confirm that no replacement glyph was
# lost here.
REPLACEMENTS_PATTERNS = {
    re.compile(source): target
    for source, target in (
        (r'\\underbar', r'\\underline'),
        (r'\\Bar', r'\\hat'),
        (r'\\Hat', r'\\hat'),
        (r'\\Tilde', r'\\tilde'),
        (r'\\slash', r'/'),
        (r'\\textperthousand', r''),
        (r'\\sun', r''),
        (r'\\textunderscore', r'\\_'),
        (r'\\fint', r''),
        (r'\\up ', r'\\ '),
    )
}

# "\qquad" that is not already followed by whitespace.
QQUAD_PATTERN = re.compile(r'\\qquad(?!\s)')
# Letter runs that, together with the "\up" prefix, form a real command:
# \uparrow, \updownarrow, \uplus, \upsilon, \upharpoonleft, \upharpoonright
# and \upuparrows. Stripping "up" from these would yield undefined commands.
_UP_SUFFIXES_TO_KEEP = frozenset((
    'arrow', 'downarrow', 'lus', 'silon',
    'harpoonleft', 'harpoonright', 'uparrows',
))


def _strip_up_prefix(match):
    r"""Keep a real ``\up...`` command as is; otherwise drop the ``up`` prefix."""
    suffix = match.group(1)
    return match.group(0) if suffix in _UP_SUFFIXES_TO_KEEP else f"\\{suffix}"


def latex_rm_whitespace(s: str):
    r"""
    Clean up a LaTeX formula string.

    Despite its name the function does not strip whitespace; it applies the
    following steps in order:

    1. Remove unpaired curly braces (fix_unbalanced_braces).
    2. Run fix_latex_left_right, which is defined outside this section
       (presumably it repairs \left / \right pairs - confirm there).
    3. Balance \begin / \end environment tags (fix_latex_environments).
    4. Rewrite "\upXYZ" to "\XYZ", except for real commands such as \uparrow.
    5. Delete the commands matched by COMMANDS_TO_REMOVE_PATTERN.
    6. Apply every pattern/replacement pair in REPLACEMENTS_PATTERNS.
    7. Normalize what follows each backslash (process_latex).
    8. Make sure "\qquad" is followed by whitespace.

    Args:
        s (str): The LaTeX formula to clean up.
    Returns:
        str: The cleaned-up LaTeX formula.
    """
    s = fix_unbalanced_braces(s)
    s = fix_latex_left_right(s)
    s = fix_latex_environments(s)

    # Use the precompiled regular expressions.
    s = UP_PATTERN.sub(_strip_up_prefix, s)
    s = COMMANDS_TO_REMOVE_PATTERN.sub('', s)

    # Apply all replacements.
    for pattern, replacement in REPLACEMENTS_PATTERNS.items():
        s = pattern.sub(replacement, s)

    # Handle backslashes and the spacing after them.
    s = process_latex(s)

    # Add a space after \qquad when it is missing.
    s = QQUAD_PATTERN.sub(r'\\qquad ', s)

    return s

View File

@@ -172,8 +172,8 @@ def filter_nested_tables(table_res_list, overlap_threshold=0.8, area_threshold=0
tables_inside = [j for j in range(len(table_res_list))
if i != j and is_inside(table_info[j], table_info[i], overlap_threshold)]
# Continue if there are at least 2 tables inside
if len(tables_inside) >= 2:
# Continue if there are at least 3 tables inside
if len(tables_inside) >= 3:
# Check if inside tables overlap with each other
tables_overlap = any(do_overlap(table_info[tables_inside[idx1]], table_info[tables_inside[idx2]])
for idx1 in range(len(tables_inside))

View File

@@ -53,6 +53,12 @@ class PytorchPaddleOCR(TextSystem):
args = parser.parse_args(args)
self.lang = kwargs.get('lang', 'ch')
device = get_device()
if device == 'cpu' and self.lang in ['ch', 'ch_server']:
logger.warning("The current device in use is CPU. To ensure the speed of parsing, the language is automatically switched to ch_lite.")
self.lang = 'ch_lite'
if self.lang in latin_lang:
self.lang = 'latin'
elif self.lang in arabic_lang:
@@ -74,7 +80,7 @@ class PytorchPaddleOCR(TextSystem):
kwargs['rec_char_dict_path'] = os.path.join(root_dir, 'pytorchocr', 'utils', 'resources', 'dict', dict_file)
# kwargs['rec_batch_num'] = 8
kwargs['device'] = get_device()
kwargs['device'] = device
default_args = vars(args)
default_args.update(kwargs)

View File

@@ -171,6 +171,31 @@ ch_PP-OCRv4_rec_server_infer:
nrtr_dim: 384
max_text_length: 25
ch_PP-OCRv4_rec_server_doc_infer:
model_type: rec
algorithm: SVTR_HGNet
Transform:
Backbone:
name: PPHGNet_small
Head:
name: MultiHead
out_channels_list:
CTCLabelDecode: 15631
head_list:
- CTCHead:
Neck:
name: svtr
dims: 120
depth: 2
hidden_dims: 120
kernel_size: [ 1, 3 ]
use_guide: True
Head:
fc_decay: 0.00001
- NRTRHead:
nrtr_dim: 384
max_text_length: 25
chinese_cht_PP-OCRv3_rec_infer:
model_type: rec
algorithm: SVTR

View File

@@ -3,10 +3,14 @@ lang:
det: ch_PP-OCRv3_det_infer.pth
rec: ch_PP-OCRv4_rec_infer.pth
dict: ppocr_keys_v1.txt
ch:
ch_server:
det: ch_PP-OCRv3_det_infer.pth
rec: ch_PP-OCRv4_rec_server_infer.pth
dict: ppocr_keys_v1.txt
ch:
det: ch_PP-OCRv3_det_infer.pth
rec: ch_PP-OCRv4_rec_server_doc_infer.pth
dict: ppocrv4_doc_dict.txt
en:
det: en_PP-OCRv3_det_infer.pth
rec: en_PP-OCRv4_rec_infer.pth

View File

@@ -86,7 +86,7 @@ Also you can try `online demo <https://www.modelscope.cn/studios/OpenDataLab/Min
<td colspan="2">GPU VRAM 6GB or more</td>
<td colspan="2">All GPUs with Tensor Cores produced from Volta(2017) onwards.<br>
More than 6GB VRAM </td>
<td rowspan="2">apple slicon</td>
<td rowspan="2">Apple silicon</td>
</tr>
</table>

View File

@@ -4,6 +4,6 @@
- [llama_index_rag](./llama_index_rag/README.md): Build a lightweight RAG system based on llama_index
- [gradio_app](./gradio_app/README.md): Build a web app based on gradio
- [web_demo](./web_demo/README.md): MinerU online [demo](https://opendatalab.com/OpenSourceTools/Extractor/PDF/) localized deployment version
- ~~[web_demo](./web_demo/README.md): MinerU online [demo](https://opendatalab.com/OpenSourceTools/Extractor/PDF/) localized deployment version~~(Deprecated)
- [web_api](./web_api/README.md): Web API Based on FastAPI
- [multi_gpu](./multi_gpu/README.md): Multi-GPU parallel processing based on LitServe

View File

@@ -4,6 +4,6 @@
- [llama_index_rag](./llama_index_rag/README_zh-CN.md): 基于 llama_index 构建轻量级 RAG 系统
- [gradio_app](./gradio_app/README_zh-CN.md): 基于 Gradio 的 Web 应用
- [web_demo](./web_demo/README_zh-CN.md): MinerU在线[demo](https://opendatalab.com/OpenSourceTools/Extractor/PDF/)本地化部署版本
- ~~[web_demo](./web_demo/README_zh-CN.md): MinerU在线[demo](https://opendatalab.com/OpenSourceTools/Extractor/PDF/)本地化部署版本~~(已过时)
- [web_api](./web_api/README.md): 基于 FastAPI 的 Web API
- [multi_gpu](./multi_gpu/README.md): 基于 LitServe 的多 GPU 并行处理

View File

@@ -117,8 +117,12 @@ def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table
return md_content, txt_content, archive_zip_path, new_pdf_path
latex_delimiters = [{'left': '$$', 'right': '$$', 'display': True},
{'left': '$', 'right': '$', 'display': False}]
latex_delimiters = [
{'left': '$$', 'right': '$$', 'display': True},
{'left': '$', 'right': '$', 'display': False},
{'left': '\\(', 'right': '\\)', 'display': False},
{'left': '\\[', 'right': '\\]', 'display': True},
]
def init_model():
@@ -158,7 +162,7 @@ devanagari_lang = [
'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom', # noqa: E126
'sa', 'bgc'
]
other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']
other_lang = ['ch', 'ch_lite', 'ch_server', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']
add_lang = ['latin', 'arabic', 'cyrillic', 'devanagari']
# all_lang = ['', 'auto']
@@ -218,7 +222,8 @@ if __name__ == '__main__':
with gr.Tabs():
with gr.Tab('Markdown rendering'):
md = gr.Markdown(label='Markdown rendering', height=1100, show_copy_button=True,
latex_delimiters=latex_delimiters, line_breaks=True)
latex_delimiters=latex_delimiters,
line_breaks=True)
with gr.Tab('Markdown text'):
md_text = gr.TextArea(lines=45, show_copy_button=True)
file.change(fn=to_pdf, inputs=file, outputs=pdf_show)

View File

@@ -28,7 +28,7 @@ app = FastAPI()
pdf_extensions = [".pdf"]
office_extensions = [".ppt", ".pptx", ".doc", ".docx"]
image_extensions = [".png", ".jpg"]
image_extensions = [".png", ".jpg", ".jpeg"]
class MemoryDataWriter(DataWriter):
def __init__(self):
@@ -128,7 +128,7 @@ def process_file(
Tuple[InferenceResult, PipeResult]: Returns inference result and pipeline result
"""
ds = Union[PymuDocDataset, ImageDataset]
ds: Union[PymuDocDataset, ImageDataset] = None
if file_extension in pdf_extensions:
ds = PymuDocDataset(file_bytes)
elif file_extension in office_extensions:

View File

@@ -10,6 +10,6 @@ scikit-learn>=1.0.2
torch>=2.2.2,!=2.5.0,!=2.5.1
torchvision
transformers>=4.49.0,!=4.51.0,<5.0.0
pdfminer.six==20231228
pdfminer.six==20250324
tqdm>=4.67.1
# The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.

View File

@@ -239,6 +239,22 @@
"created_at": "2025-04-14T10:40:54Z",
"repoId": 765083837,
"pullRequestNo": 2226
},
{
"name": "vloum",
"id": 75369577,
"comment_id": 2811669681,
"created_at": "2025-04-17T03:54:59Z",
"repoId": 765083837,
"pullRequestNo": 2267
},
{
"name": "kowyo",
"id": 110339237,
"comment_id": 2829263082,
"created_at": "2025-04-25T02:54:20Z",
"repoId": 765083837,
"pullRequestNo": 2367
}
]
}

View File

@@ -323,44 +323,6 @@ class TestCli:
logging.info(cmd)
os.system(cmd)
@pytest.mark.P1
def test_local_magic_pdf_open_st_table(self):
"""magic pdf cli open st table."""
time.sleep(2)
#pre_cmd = "cp ~/magic_pdf_st.json ~/magic-pdf.json"
value = {
"model": "struct_eqtable",
"enable": True,
"max_time": 400
}
common.update_config_file(magic_pdf_config, "table-config", value)
pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
common.delete_file(pdf_res_path)
cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
os.system(cli_cmd)
res = common.check_html_table_exists(os.path.join(pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md"))
assert res is True
@pytest.mark.P1
def test_local_magic_pdf_open_tablemaster_cuda(self):
"""magic pdf cli open table master html table cuda mode."""
time.sleep(2)
#pre_cmd = "cp ~/magic_pdf_html.json ~/magic-pdf.json"
#os.system(pre_cmd)
value = {
"model": "tablemaster",
"enable": True,
"max_time": 400
}
common.update_config_file(magic_pdf_config, "table-config", value)
pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
common.delete_file(pdf_res_path)
cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
os.system(cli_cmd)
res = common.check_html_table_exists(os.path.join(pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md"))
assert res is True
@pytest.mark.P1
def test_local_magic_pdf_open_rapidai_table(self):
"""magic pdf cli open rapid ai table."""
@@ -370,6 +332,7 @@ class TestCli:
value = {
"model": "rapid_table",
"enable": True,
"sub_model": "slanet_plus",
"max_time": 400
}
common.update_config_file(magic_pdf_config, "table-config", value)
@@ -397,6 +360,7 @@ class TestCli:
os.system(cli_cmd)
common.cli_count_folders_and_check_contents(os.path.join(pdf_res_path, "test_rearch_report", "auto"))
@pytest.mark.skip(reason="layoutlmv3废弃")
@pytest.mark.P1
def test_local_magic_pdf_layoutlmv3_yolo(self):
"""magic pdf cli open layoutlmv3."""
@@ -419,8 +383,9 @@ class TestCli:
#pre_cmd = "cp ~/magic_pdf_html_table_cpu.json ~/magic-pdf.json"
#os.system(pre_cmd)
value = {
"model": "tablemaster",
"enable": False,
"model": "rapid_table",
"enable": True,
"sub_model": "slanet_plus",
"max_time": 400
}
common.update_config_file(magic_pdf_config, "table-config", value)
@@ -439,8 +404,9 @@ class TestCli:
#pre_cmd = "cp ~/magic_pdf_close_table.json ~/magic-pdf.json"
#os.system(pre_cmd)
value = {
"model": "tablemaster",
"model": "rapid_table",
"enable": False,
"sub_model": "slanet_plus",
"max_time": 400
}
common.update_config_file(magic_pdf_config, "table-config", value)

View File

@@ -41,7 +41,7 @@ class TestppTableModel(unittest.TestCase):
# 检查第一行数据
first_row = tree.xpath('//table/tr[2]/td')
assert len(first_row) == 5, "First row should have 5 cells"
assert first_row[0].text and first_row[0].text.strip() == "SegLink[26]", "First cell should be 'SegLink[26]'"
assert first_row[0].text and 'SegLink' in first_row[0].text.strip(), "First cell should be 'SegLink [26]'"
assert first_row[1].text and first_row[1].text.strip() == "70.0", "Second cell should be '70.0'"
assert first_row[2].text and first_row[2].text.strip() == "86.0", "Third cell should be '86.0'"
assert first_row[3].text and first_row[3].text.strip() == "77.0", "Fourth cell should be '77.0'"