Compare commits

...

86 Commits

Author SHA1 Message Date
Xiaomeng Zhao
a989444e2f Merge pull request #2514 from opendatalab/release-1.3.12
Release 1.3.12
2025-05-24 16:02:43 +08:00
Xiaomeng Zhao
e3a4295527 Merge pull request #2513 from myhloli/dev
feat(docs): update changelog for PP-OCRv5 model support and handwritten document recognition enhancements
2025-05-24 15:55:39 +08:00
myhloli
73f0530d16 feat(docs): update changelog for PP-OCRv5 model support and handwritten document recognition enhancements 2025-05-24 15:47:31 +08:00
Xiaomeng Zhao
e92b5b698e Merge pull request #2512 from myhloli/dev
fix(ocr): adjust area ratio threshold and update fitz document handling in image conversion
2025-05-24 13:46:17 +08:00
myhloli
1e01ffcf78 fix(ocr): adjust area ratio threshold and update fitz document handling in image conversion 2025-05-24 13:39:34 +08:00
Xiaomeng Zhao
04b81dc1ab Merge pull request #2511 from myhloli/dev
Merge pull request #10 from myhloli/img2text
2025-05-24 12:01:47 +08:00
Xiaomeng Zhao
90585b67a9 Merge pull request #2510 from myhloli/img2text
feat(ocr): add area ratio calculation for OCR results and enhance get_coords_and_area function
2025-05-24 12:00:34 +08:00
Xiaomeng Zhao
4949dd0c18 Merge pull request #10 from myhloli/img2text
feat(ocr): add area ratio calculation for OCR results and enhance get_coords_and_area function
2025-05-24 11:59:52 +08:00
myhloli
a2b848136b feat(ocr): add area ratio calculation for OCR results and enhance get_coords_and_area function 2025-05-24 11:58:02 +08:00
Xiaomeng Zhao
04a712f940 Merge pull request #2506 from myhloli/dev
feat(ocr): implement PPHGNetV2 architecture with multiple stages and layers
2025-05-23 18:09:27 +08:00
myhloli
27cad566fa feat(ocr): implement PPHGNetV2 architecture with multiple stages and layers 2025-05-23 18:06:21 +08:00
Xiaomeng Zhao
ea3003f6ef Merge pull request #2505 from myhloli/dev
feat(ocr): add PPHGNetV2_B4 backbone and update OCR models
2025-05-23 17:34:56 +08:00
myhloli
93ad41edce feat(ocr): add PPHGNetV2_B4 backbone and update OCR models
- Add PPHGNetV2_B4 backbone to the list of supported backbones
- Introduce new OCR model configuration for PP-OCRv5 with PPHGNetV2_B4
- Update existing model configurations to use the new backbone
- Modify RNN neck to support input with H > 1
- Adjust batch size for inference
2025-05-23 17:06:52 +08:00
Xiaomeng Zhao
8f8b8c4c1f Merge pull request #2501 from myhloli/dev
feat(ocr): add PP-OCRv5 models and update configurations
2025-05-22 17:40:43 +08:00
myhloli
048f6af406 feat(ocr): add PP-OCRv5 models and update configurations
- Add new PP-OCRv5 detection and recognition models
- Update arch_config.yaml with new model architectures
- Modify models_config.yml to include PP-OCRv5 models for ch_lite configuration
- Change dictionary file for ch_lite to ppocrv5_dict.txt
2025-05-22 17:29:47 +08:00
Xiaomeng Zhao
b122b86e8a Merge pull request #2487 from myhloli/dev
fix(ocr_mkcontent): improve image handling and footnote integration in markdown output
2025-05-19 15:47:48 +08:00
myhloli
002333a8d7 fix(ocr_mkcontent): improve image handling and footnote integration in markdown output 2025-05-19 15:45:26 +08:00
Xiaomeng Zhao
e3f22e84ab Merge pull request #2468 from opendatalab/master
master->dev
2025-05-14 10:46:50 +08:00
myhloli
40851b1c61 Update version.py with new version 2025-05-14 02:34:34 +00:00
Xiaomeng Zhao
ea619281ef Merge pull request #2467 from opendatalab/release-1.3.11
Release 1.3.11
2025-05-14 10:33:00 +08:00
Xiaomeng Zhao
212cfcf24a Merge pull request #2466 from opendatalab/dev
docs(changelog): remove pdfminer.six version pinning from release notes
2025-05-14 10:32:31 +08:00
Xiaomeng Zhao
cda85d6262 Merge pull request #2465 from myhloli/dev
docs(changelog): remove pdfminer.six version pinning from release notes
2025-05-14 10:31:55 +08:00
myhloli
51ceb48014 docs(changelog): remove pdfminer.six version pinning from release notes 2025-05-14 10:30:55 +08:00
Xiaomeng Zhao
0b8c614280 Merge pull request #2464 from opendatalab/release-1.3.11
Release 1.3.11
2025-05-14 10:22:18 +08:00
Xiaomeng Zhao
c1b387abe6 Merge pull request #2451 from myhloli/dev
fix(modeling): escape backslashes in LaTeX command descriptions
2025-05-10 00:37:50 +08:00
myhloli
1ab54ac2e3 fix(modeling): escape backslashes in LaTeX command descriptions 2025-05-10 00:34:11 +08:00
myhloli
78a0208425 docs(installation): remove numpy version restriction from PyTorch installation instructions 2025-05-10 00:28:55 +08:00
Xiaomeng Zhao
cd785f6af8 Merge pull request #2450 from myhloli/dev
fix(requirements): update pdfminer.six version and restrict torch version upper limit
2025-05-09 23:58:42 +08:00
myhloli
a8f752f753 fix(requirements): update pdfminer.six version and restrict torch version upper limit 2025-05-09 23:57:22 +08:00
Xiaomeng Zhao
65f332ffae Merge pull request #2449 from myhloli/dev
fix(setup): update python_requires to support Python 3.10 to 3.13
2025-05-09 23:45:16 +08:00
myhloli
c4b04ae642 Merge remote-tracking branch 'origin/dev' into dev 2025-05-09 23:38:50 +08:00
myhloli
3858d918dd fix(setup): update python_requires to support Python 3.10 to 3.13 2025-05-09 23:38:37 +08:00
Xiaomeng Zhao
70696165c7 Merge pull request #2446 from myhloli/dev
fix(Dockerfile): update modelscope installation command to use mirror
2025-05-09 18:23:08 +08:00
myhloli
b799d302c2 Merge remote-tracking branch 'origin/dev' into dev 2025-05-09 17:35:01 +08:00
myhloli
9351d64a41 fix(Dockerfile): update modelscope installation command to use mirror 2025-05-09 17:33:47 +08:00
Xiaomeng Zhao
3230793b55 Merge pull request #2440 from myhloli/dev
docs(installation): update Python version and CUDA installation instructions
2025-05-09 11:10:09 +08:00
myhloli
9f0d45bb58 docs(installation): update Python version and CUDA installation instructions 2025-05-09 10:48:14 +08:00
Xiaomeng Zhao
6c9645aa0c Merge pull request #2437 from myhloli/dev
docs(README): reorder installation commands for clarity
2025-05-08 18:56:34 +08:00
myhloli
96fb646a86 Merge remote-tracking branch 'origin/dev' into dev 2025-05-08 18:55:49 +08:00
myhloli
71a429a32e docs(README): reorder installation commands for clarity 2025-05-08 18:54:39 +08:00
Xiaomeng Zhao
201e338b3a Merge pull request #2429 from myhloli/dev
feat(modeling): add regex patterns for LaTeX symbol replacements
2025-05-08 11:27:57 +08:00
myhloli
2a28f604c6 feat(modeling): add regex patterns for LaTeX symbol replacements 2025-05-08 11:26:42 +08:00
Xiaomeng Zhao
38d0a622d9 Merge pull request #2423 from myhloli/dev
feat(modeling): add 'protect' command to removal patterns
2025-05-06 18:22:18 +08:00
myhloli
a8ca183094 feat(modeling): add 'protect' command to removal patterns 2025-05-06 18:21:03 +08:00
Xiaomeng Zhao
11bf98d0aa Merge pull request #2411 from CharlesKeeling65/patch-1
Update app.py: Fix parameter parsing in /file_parse endpoint
2025-04-30 17:51:08 +08:00
github-actions[bot]
50700646e4 @CharlesKeeling65 has signed the CLA in opendatalab/MinerU#2411 2025-04-30 09:25:44 +00:00
Wang Yubo
862891e294 Update app.py: Fix parameter parsing in /file_parse endpoint
I have updated the `/file_parse` endpoint in `app.py` to correctly handle boolean and string parameters when they are sent via `multipart/form-data` requests (commonly used for file uploads). Previously, these parameters were not being properly parsed because FastAPI expects them to be passed as query or JSON body parameters by default.

### Changes Made:
- Added `Form(...)` to all non-file parameters (`parse_method`, `is_json_md_dump`, `output_dir`, and return flags like `return_layout`, etc.).
- This ensures that FastAPI correctly reads these fields from form-data, allowing clients to send both files and structured configuration options in the same request.

### Why This Change Was Needed:
- When using `requests.post(..., data=data, files=files)`, the `data` dictionary is sent as form-encoded data.
- Without explicitly declaring these fields with `Form(...)`, FastAPI does not bind them correctly, leading to default values always being used (e.g., `False` for boolean flags).
- This change allows the API to accurately reflect the client's intent and enables features like `return_layout`, `return_images`, etc., to work as expected.

This update improves compatibility with HTTP clients that rely on standard form-based file upload mechanisms while preserving the existing behavior of the API.
2025-04-30 17:15:54 +08:00
Xiaomeng Zhao
f0b66d3aab Merge pull request #2410 from myhloli/dev
feat(model): add logging for batch image processing
2025-04-30 17:09:49 +08:00
myhloli
b29b73af21 feat(model): add logging for batch image processing
- Add logger info for each batch processed
- Include batch number and page count in log message
2025-04-30 17:08:20 +08:00
Xiaomeng Zhao
5e8656c74f Merge pull request #2406 from opendatalab/master
update version
2025-04-29 16:09:37 +08:00
myhloli
2aaf2310f2 Update version.py with new version 2025-04-29 08:06:04 +00:00
Xiaomeng Zhao
8802687934 Merge pull request #2404 from opendatalab/release-1.3.10
Release 1.3.10
2025-04-29 15:48:55 +08:00
Xiaomeng Zhao
2c2fcbe832 Merge pull request #2403 from myhloli/dev
feat(model_utils): adjust table detection threshold and add features
2025-04-29 15:27:44 +08:00
myhloli
9c37d65fab docs(README_zh-CN): update doc 2025-04-29 15:26:08 +08:00
myhloli
49a8f8be0a feat(model_utils): adjust table detection threshold and add features
- Adjust the threshold for considering tables inside other tables from 2 to 3
- Add support for custom formula delimiters through user configuration
- Pin pdfminer.six to version 20250324 to prevent parsing failures
2025-04-29 15:24:28 +08:00
Xiaomeng Zhao
5e15d9b664 Merge pull request #2402 from myhloli/dev
build(deps): pin pdfminer.six version to 20250324
2025-04-29 14:56:21 +08:00
myhloli
81daf298b5 build(deps): pin pdfminer.six version to 20250324
- Update pdfminer.six dependency from >=20250416 to ==20250324
- This change ensures compatibility with specific project requirements
2025-04-29 14:55:07 +08:00
myhloli
2d4e9e544e Merge remote-tracking branch 'origin/dev' into dev 2025-04-29 10:54:34 +08:00
myhloli
dfd13fa2ab fix(mfr): add LaTeX symbol replacements for fint and up
- Add regex patterns for replacing LaTeX symbols \fint and \up with their Unicode equivalents
2025-04-29 10:53:40 +08:00
Xiaomeng Zhao
2cf55ce1d1 Merge pull request #2395 from myhloli/dev
feat(latex): enhance LaTeX delimiter support and configurability
2025-04-28 14:37:33 +08:00
myhloli
100e9c17a5 feat(latex): enhance LaTeX delimiter support and configurability
- Add support for \(\) and \[\] delimiters in addition to $ and $$
- Make LaTeX delimiter configuration more flexible and user-defined
- Update configuration file to include LaTeX delimiter settings
- Modify OCR content generation to use configurable delimiters
2025-04-28 14:35:39 +08:00
Xiaomeng Zhao
cf33cb882d Merge pull request #2389 from myhloli/dev
fix(mfr): add underscore symbol to unimernet
2025-04-28 01:56:17 +08:00
myhloli
98dd179053 Merge remote-tracking branch 'origin/dev' into dev 2025-04-28 01:55:20 +08:00
myhloli
7d77d614ec fix(mfr): add underscore symbol to unimernet
- Add \textunderscore to the list of LaTeX patterns
- This allows the model to properly render underscore characters
2025-04-28 01:54:29 +08:00
Xiaomeng Zhao
c060413b19 Merge pull request #2388 from opendatalab/master
update version
2025-04-27 18:30:05 +08:00
myhloli
1e715d026d Update version.py with new version 2025-04-27 10:23:03 +00:00
Xiaomeng Zhao
0d5762e57a Merge pull request #2381 from opendatalab/release-1.3.9
Release 1.3.9
2025-04-27 18:18:46 +08:00
Xiaomeng Zhao
d68fe15bde Merge pull request #2386 from opendatalab/dev
Dev
2025-04-27 18:18:28 +08:00
Xiaomeng Zhao
9bdc254456 Merge pull request #2385 from myhloli/dev
docs: correct typo for Apple Silicon in install guide and README
2025-04-27 18:18:01 +08:00
myhloli
ebb7df984e docs: correct typo for Apple Silicon in install guide and README
- Fix typo in install.rst and README_zh-CN.md
- Change 'apple slicon' to 'Apple silicon'
2025-04-27 18:16:46 +08:00
Xiaomeng Zhao
e54f8fd31e Merge pull request #2384 from opendatalab/dev
docs(README): fix typo
2025-04-27 18:14:46 +08:00
Xiaomeng Zhao
9f892a5e9d Merge pull request #2367 from kowyo/patch-1
docs(README): fix typo
2025-04-27 18:14:08 +08:00
Xiaomeng Zhao
623537dd9c Merge pull request #2383 from opendatalab/dev
update readme
2025-04-27 18:12:26 +08:00
Xiaomeng Zhao
c1fbf01c43 Merge pull request #2382 from myhloli/dev
feat(pdf): optimize formula parsing and update pdfminer.six
2025-04-27 18:11:47 +08:00
myhloli
0807e971fe feat(pdf): optimize formula parsing and update pdfminer.six
- Improve formula parsing success rate for better formula rendering
- Upgrade pdfminer.six to the latest version to fix PDF parsing issues
- Update changelog in both English and Chinese README files
2025-04-27 18:10:02 +08:00
Xiaomeng Zhao
ef854b23aa Merge pull request #2380 from myhloli/dev
build(deps): update pdfminer.six to latest version
2025-04-27 17:38:42 +08:00
myhloli
2d1a0f2ca6 fix(mfr): optimize LaTeX formula repair functionality
- Improve \left and \right command handling in LaTeX formulas
- Enhance environment type matching for array, matrix, and other structures
- Refactor code for better readability and maintainability
2025-04-27 17:35:36 +08:00
myhloli
c8747cffb4 fix(magic_pdf): improve LaTeX formula processing and environment handling
- Refactor LaTeX left/right pair fixing logic for better balance
- Add environment detection and correction for common math environments
- Implement more robust whitespace handling and command substitution
- Optimize regex patterns for improved performance and readability
2025-04-27 17:10:15 +08:00
myhloli
0299dea199 build(deps): update pdfminer.six to latest version
- Change pdfminer.six dependency from ==20231228 to >=20250416
- This update ensures compatibility with the latest version of pdfminer.six
2025-04-27 16:38:34 +08:00
myhloli
2e91fb3f52 fix(mfr): improve LaTeX formula processing and repair
- Add functions to fix LaTeX left and right commands
- Implement brace matching and repair in LaTeX formulas
- Remove unnecessary whitespace and repair LaTeX code
- Replace specific LaTeX commands with appropriate alternatives
- Add logging for debugging purposes
2025-04-25 20:43:39 +08:00
myhloli
6c1511517a fix(mfr): improve LaTeX formula processing and repair
- Add functions to fix LaTeX left and right commands
- Implement brace matching and repair in LaTeX formulas
- Remove unnecessary whitespace and repair LaTeX code
- Replace specific LaTeX commands with appropriate alternatives
- Add logging for debugging purposes
2025-04-25 20:12:50 +08:00
github-actions[bot]
b864062a4f @kowyo has signed the CLA in opendatalab/MinerU#2367 2025-04-25 02:54:31 +00:00
小林在忙毕业设计
c1558af3ef docs(README): fix typo 2025-04-24 23:08:19 +08:00
Xiaomeng Zhao
2a9ac8939f Merge pull request #2365 from myhloli/dev
fix(mfr): improve LaTeX whitespace handling in unimernet model
2025-04-24 19:34:45 +08:00
myhloli
bfb80cb2e5 fix(mfr): improve LaTeX whitespace handling in unimernet model
- Preserve "\ " sequences during whitespace removal
- Add temporary substitution to prevent incorrect processing of "\ " sequences
- Restore "\ " sequences after removing unnecessary whitespace
2025-04-24 19:33:03 +08:00
Xiaomeng Zhao
80a80482f3 Merge pull request #2356 from opendatalab/master
master->dev
2025-04-23 18:50:42 +08:00
30 changed files with 19811 additions and 107 deletions

View File

@@ -48,6 +48,24 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
</div>
# Changelog
- 2025/05/24 1.3.12 Released
- Added support for ppocrv5 model, updated `ch_server` model to `PP-OCRv5_rec_server` and `ch_lite` model to `PP-OCRv5_rec_mobile` (model update required)
- In testing, we found that ppocrv5(server) shows some improvement for handwritten documents, but slightly lower accuracy than v4_server_doc for other document types. Therefore, the default ch model remains unchanged as `PP-OCRv4_server_rec_doc`.
- Since ppocrv5 enhances recognition capabilities for handwritten text and special characters, you can manually select ppocrv5 models for Japanese, traditional Chinese mixed scenarios and handwritten document scenarios
- You can select the appropriate model through the lang parameter `lang='ch_server'` (python api) or `--lang ch_server` (command line):
- `ch`: `PP-OCRv4_server_rec_doc` (default) (Chinese, English, Japanese, Traditional Chinese mixed/15k dictionary)
- `ch_server`: `PP-OCRv5_rec_server` (Chinese, English, Japanese, Traditional Chinese mixed + handwriting/18k dictionary)
- `ch_lite`: `PP-OCRv5_rec_mobile` (Chinese, English, Japanese, Traditional Chinese mixed + handwriting/18k dictionary)
- `ch_server_v4`: `PP-OCRv4_rec_server` (Chinese, English mixed/6k dictionary)
- `ch_lite_v4`: `PP-OCRv4_rec_mobile` (Chinese, English mixed/6k dictionary)
- Added support for handwritten documents by optimizing layout recognition of handwritten text areas
- This feature is supported by default, no additional configuration needed
- You can refer to the instructions above to manually select ppocrv5 model for better handwritten document parsing
- The demos on `huggingface` and `modelscope` have been updated to support handwriting recognition and ppocrv5 models, which you can experience online
- 2025/04/29 1.3.10 Released
- Support for custom formula delimiters can be achieved by modifying the `latex-delimiter-config` item in the `magic-pdf.json` file under the user directory.
- 2025/04/27 1.3.9 Released
- Optimized the formula parsing function to improve the success rate of formula rendering
- 2025/04/23 1.3.8 Released
- The default `ocr` model (`ch`) has been updated to `PP-OCRv4_server_rec_doc` (model update required)
- `PP-OCRv4_server_rec_doc` is trained on a mix of more Chinese document data and PP-OCR training data, enhancing recognition capabilities for some traditional Chinese characters, Japanese, and special characters. It supports over 15,000 recognizable characters, improving text recognition in documents while also boosting general text recognition.
@@ -349,7 +367,7 @@ There are three different ways to experience MinerU:
</tr>
<tr>
<td colspan="3">Python Version</td>
<td colspan="3">>=3.10</td>
<td colspan="3">3.10~3.13</td>
</tr>
<tr>
<td colspan="3">Nvidia Driver Version</td>
@@ -359,8 +377,7 @@ There are three different ways to experience MinerU:
</tr>
<tr>
<td colspan="3">CUDA Environment</td>
<td>11.8/12.4/12.6/12.8</td>
<td>11.8/12.4/12.6/12.8</td>
<td colspan="2"><a href="https://pytorch.org/get-started/locally/">Refer to the PyTorch official website</a></td>
<td>None</td>
</tr>
<tr>
@@ -374,7 +391,7 @@ There are three different ways to experience MinerU:
<td colspan="2">GPU VRAM 6GB or more</td>
<td colspan="2">All GPUs with Tensor Cores produced from Volta(2017) onwards.<br>
More than 6GB VRAM </td>
<td rowspan="2">apple slicon</td>
<td rowspan="2">Apple silicon</td>
</tr>
</table>
@@ -391,7 +408,7 @@ Synced with dev branch updates:
#### 1. Install magic-pdf
```bash
conda create -n mineru 'python>=3.10' -y
conda create -n mineru 'python=3.12' -y
conda activate mineru
pip install -U "magic-pdf[full]"
```

View File

@@ -47,6 +47,24 @@
</div>
# 更新记录
- 2025/05/24 1.3.12 发布
- 增加ppocrv5模型的支持,`ch_server`模型更新为`PP-OCRv5_rec_server`,`ch_lite`模型更新为`PP-OCRv5_rec_mobile`(需更新模型)
- 在测试中发现ppocrv5(server)对手写文档效果有一定提升,但在其余类别文档的精度略差于v4_server_doc,因此默认的ch模型保持不变,仍为`PP-OCRv4_server_rec_doc`
- 由于ppocrv5强化了手写场景和特殊字符的识别能力,因此您可以在日繁混合场景以及手写文档场景下手动选择使用ppocrv5模型
- 您可通过lang参数`lang='ch_server'`(python api)或`--lang ch_server`(命令行)自行选择相应的模型:
- `ch`:`PP-OCRv4_server_rec_doc`(默认)(中英日繁混合/1.5w字典)
- `ch_server`:`PP-OCRv5_rec_server`(中英日繁混合+手写场景/1.8w字典)
- `ch_lite`:`PP-OCRv5_rec_mobile`(中英日繁混合+手写场景/1.8w字典)
- `ch_server_v4`:`PP-OCRv4_rec_server`(中英混合/6k字典)
- `ch_lite_v4`:`PP-OCRv4_rec_mobile`(中英混合/6k字典)
- 增加手写文档的支持,通过优化layout对手写文本区域的识别,现已支持手写文档的解析
- 默认支持此功能,无需额外配置
- 可以参考上述说明手动选择ppocrv5模型以获得更好的手写文档解析效果
- `huggingface`与`modelscope`的demo已更新为支持手写识别和ppocrv5模型的版本,可自行在线体验
- 2025/04/29 1.3.10 发布
- 支持使用自定义公式标识符,可通过修改用户目录下的`magic-pdf.json`文件中的`latex-delimiter-config`项实现。
- 2025/04/27 1.3.9 发布
- 优化公式解析功能,提升公式渲染的成功率
- 2025/04/23 1.3.8 发布
- `ocr`默认模型(`ch`)更新为`PP-OCRv4_server_rec_doc`(需更新模型)
- `PP-OCRv4_server_rec_doc`是在`PP-OCRv4_server_rec`的基础上在更多中文文档数据和PP-OCR训练数据的混合数据训练而成增加了部分繁体字、日文、特殊字符的识别能力可支持识别的字符为1.5万+,除文档相关的文字识别能力提升外,也同时提升了通用文字的识别能力。
@@ -338,7 +356,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
</tr>
<tr>
<td colspan="3">python版本</td>
<td colspan="3">>=3.10</td>
<td colspan="3">3.10~3.13</td>
</tr>
<tr>
<td colspan="3">Nvidia Driver 版本</td>
@@ -348,8 +366,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
</tr>
<tr>
<td colspan="3">CUDA环境</td>
<td>11.8/12.4/12.6/12.8</td>
<td>11.8/12.4/12.6/12.8</td>
<td colspan="2"><a href="https://pytorch.org/get-started/locally/">Refer to the PyTorch official website</a></td>
<td>None</td>
</tr>
<tr>
@@ -364,7 +381,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
<td colspan="2">
Volta(2017)及之后生产的全部带Tensor Core的GPU <br>
6G显存及以上</td>
<td rowspan="2">apple slicon</td>
<td rowspan="2">Apple silicon</td>
</tr>
</table>
@@ -384,7 +401,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
> 最新版本国内镜像源同步可能会有延迟,请耐心等待
```bash
conda create -n mineru 'python>=3.10' -y
conda create -n mineru 'python=3.12' -y
conda activate mineru
pip install -U "magic-pdf[full]" -i https://mirrors.aliyun.com/pypi/simple
```

View File

@@ -45,7 +45,7 @@ RUN /bin/bash -c "wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/m
pip3 install -U magic-pdf[full] -i https://mirrors.aliyun.com/pypi/simple"
# Download models and update the configuration file
RUN /bin/bash -c "pip3 install modelscope && \
RUN /bin/bash -c "pip3 install modelscope -i https://mirrors.aliyun.com/pypi/simple && \
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models.py -O download_models.py && \
python3 download_models.py && \
sed -i 's|cpu|cuda|g' /root/magic-pdf.json"

View File

@@ -54,7 +54,7 @@ In the final step, enter `yes`, close the terminal, and reopen it.
### 4. Create an Environment Using Conda
```bash
conda create -n mineru 'python>=3.10' -y
conda create -n mineru 'python=3.12' -y
conda activate mineru
```

View File

@@ -54,7 +54,7 @@ bash Anaconda3-2024.06-1-Linux-x86_64.sh
## 4. 使用conda 创建环境
```bash
conda create -n mineru 'python>=3.10' -y
conda create -n mineru 'python=3.12' -y
conda activate mineru
```

View File

@@ -2,11 +2,12 @@
### 1. Install CUDA and cuDNN
You need to install a CUDA version that is compatible with torch's requirements. Currently, torch supports CUDA 11.8/12.4/12.6.
You need to install a CUDA version that is compatible with torch's requirements. For details, please refer to the [official PyTorch website](https://pytorch.org/get-started/locally/).
- CUDA 11.8 https://developer.nvidia.com/cuda-11-8-0-download-archive
- CUDA 12.4 https://developer.nvidia.com/cuda-12-4-0-download-archive
- CUDA 12.6 https://developer.nvidia.com/cuda-12-6-0-download-archive
- CUDA 12.8 https://developer.nvidia.com/cuda-12-8-0-download-archive
### 2. Install Anaconda
@@ -17,7 +18,7 @@ Download link: https://repo.anaconda.com/archive/Anaconda3-2024.06-1-Windows-x86
### 3. Create an Environment Using Conda
```bash
conda create -n mineru 'python>=3.10' -y
conda create -n mineru 'python=3.12' -y
conda activate mineru
```
@@ -63,7 +64,7 @@ If your graphics card has at least 6GB of VRAM, follow these steps to test CUDA-
1. **Overwrite the installation of torch and torchvision** supporting CUDA.(Please select the appropriate index-url based on your CUDA version. For more details, refer to the [PyTorch official website](https://pytorch.org/get-started/locally/).)
```
pip install --force-reinstall torch torchvision "numpy<=2.1.1" --index-url https://download.pytorch.org/whl/cu124
pip install --force-reinstall torch torchvision --index-url https://download.pytorch.org/whl/cu124
```
2. **Modify the value of `"device-mode"`** in the `magic-pdf.json` configuration file located in your user directory.

View File

@@ -1,12 +1,13 @@
# Windows10/11
## 1. 安装cuda和cuDNN
## 1. 安装cuda环境
需要安装符合torch要求的cuda版本torch目前支持11.8/12.4/12.6
需要安装符合torch要求的cuda版本具体可参考[torch官网](https://pytorch.org/get-started/locally/)
- CUDA 11.8 https://developer.nvidia.com/cuda-11-8-0-download-archive
- CUDA 12.4 https://developer.nvidia.com/cuda-12-4-0-download-archive
- CUDA 12.6 https://developer.nvidia.com/cuda-12-6-0-download-archive
- CUDA 12.8 https://developer.nvidia.com/cuda-12-8-0-download-archive
## 2. 安装anaconda
@@ -18,7 +19,7 @@ https://mirrors.tuna.tsinghua.edu.cn/anaconda/archive/Anaconda3-2024.06-1-Window
## 3. 使用conda 创建环境
```bash
conda create -n mineru 'python>=3.10' -y
conda create -n mineru 'python=3.12' -y
conda activate mineru
```
@@ -64,7 +65,7 @@ pip install -U magic-pdf[full] -i https://mirrors.aliyun.com/pypi/simple
**1.覆盖安装支持cuda的torch和torchvision**(请根据cuda版本选择合适的index-url具体可参考[torch官网](https://pytorch.org/get-started/locally/))
```bash
pip install --force-reinstall torch torchvision "numpy<=2.1.1" --index-url https://download.pytorch.org/whl/cu124
pip install --force-reinstall torch torchvision --index-url https://download.pytorch.org/whl/cu124
```
**2.修改【用户目录】中配置文件magic-pdf.json中"device-mode"的值**

View File

@@ -20,6 +20,16 @@
"enable": true,
"max_time": 400
},
"latex-delimiter-config": {
"display": {
"left": "$$",
"right": "$$"
},
"inline": {
"left": "$",
"right": "$"
}
},
"llm-aided-config": {
"formula_aided": {
"api_key": "your_api_key",
@@ -40,5 +50,5 @@
"enable": false
}
},
"config_version": "1.2.0"
"config_version": "1.2.1"
}

View File

@@ -10,22 +10,22 @@ from loguru import logger
def fitz_doc_to_image(doc, dpi=200) -> dict:
def fitz_doc_to_image(page, dpi=200) -> dict:
"""Convert fitz.Document to image, Then convert the image to numpy array.
Args:
doc (_type_): pymudoc page
page (_type_): pymudoc page
dpi (int, optional): reset the dpi of dpi. Defaults to 200.
Returns:
dict: {'img': numpy array, 'width': width, 'height': height }
"""
mat = fitz.Matrix(dpi / 72, dpi / 72)
pm = doc.get_pixmap(matrix=mat, alpha=False)
pm = page.get_pixmap(matrix=mat, alpha=False)
# If the width or height exceeds 4500 after scaling, do not scale further.
if pm.width > 4500 or pm.height > 4500:
pm = doc.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
# Convert pixmap samples directly to numpy array
img = np.frombuffer(pm.samples, dtype=np.uint8).reshape(pm.height, pm.width, 3)

View File

@@ -5,6 +5,7 @@ from loguru import logger
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.config_reader import get_latex_delimiter_config
from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
from magic_pdf.post_proc.para_split_v3 import ListLineTag
@@ -69,19 +70,34 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
if mode == 'nlp':
continue
elif mode == 'mm':
for block in para_block['blocks']: # 1st.拼image_body
if block['type'] == BlockType.ImageBody:
for line in block['lines']:
for span in line['spans']:
if span['type'] == ContentType.Image:
if span.get('image_path', ''):
para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
for block in para_block['blocks']: # 2nd.拼image_caption
if block['type'] == BlockType.ImageCaption:
para_text += merge_para_with_text(block) + ' \n'
for block in para_block['blocks']: # 3rd.拼image_footnote
if block['type'] == BlockType.ImageFootnote:
para_text += merge_para_with_text(block) + ' \n'
# 检测是否存在图片脚注
has_image_footnote = any(block['type'] == BlockType.ImageFootnote for block in para_block['blocks'])
# 如果存在图片脚注,则将图片脚注拼接到图片正文后面
if has_image_footnote:
for block in para_block['blocks']: # 1st.拼image_caption
if block['type'] == BlockType.ImageCaption:
para_text += merge_para_with_text(block) + ' \n'
for block in para_block['blocks']: # 2nd.拼image_body
if block['type'] == BlockType.ImageBody:
for line in block['lines']:
for span in line['spans']:
if span['type'] == ContentType.Image:
if span.get('image_path', ''):
para_text += f"![]({img_buket_path}/{span['image_path']})"
for block in para_block['blocks']: # 3rd.拼image_footnote
if block['type'] == BlockType.ImageFootnote:
para_text += ' \n' + merge_para_with_text(block)
else:
for block in para_block['blocks']: # 1st.拼image_body
if block['type'] == BlockType.ImageBody:
for line in block['lines']:
for span in line['spans']:
if span['type'] == ContentType.Image:
if span.get('image_path', ''):
para_text += f"![]({img_buket_path}/{span['image_path']})"
for block in para_block['blocks']: # 2nd.拼image_caption
if block['type'] == BlockType.ImageCaption:
para_text += ' \n' + merge_para_with_text(block)
elif para_type == BlockType.Table:
if mode == 'nlp':
continue
@@ -95,20 +111,19 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
for span in line['spans']:
if span['type'] == ContentType.Table:
# if processed by table model
if span.get('latex', ''):
para_text += f"\n\n$\n {span['latex']}\n$\n\n"
elif span.get('html', ''):
para_text += f"\n\n{span['html']}\n\n"
if span.get('html', ''):
para_text += f"\n{span['html']}\n"
elif span.get('image_path', ''):
para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
para_text += f"![]({img_buket_path}/{span['image_path']})"
for block in para_block['blocks']: # 3rd.拼table_footnote
if block['type'] == BlockType.TableFootnote:
para_text += merge_para_with_text(block) + ' \n'
para_text += '\n' + merge_para_with_text(block) + ' '
if para_text.strip() == '':
continue
else:
page_markdown.append(para_text.strip() + ' ')
# page_markdown.append(para_text.strip() + ' ')
page_markdown.append(para_text.strip())
return page_markdown
@@ -145,6 +160,19 @@ def full_to_half(text: str) -> str:
result.append(char)
return ''.join(result)
latex_delimiters_config = get_latex_delimiter_config()
default_delimiters = {
'display': {'left': '$$', 'right': '$$'},
'inline': {'left': '$', 'right': '$'}
}
delimiters = latex_delimiters_config if latex_delimiters_config else default_delimiters
display_left_delimiter = delimiters['display']['left']
display_right_delimiter = delimiters['display']['right']
inline_left_delimiter = delimiters['inline']['left']
inline_right_delimiter = delimiters['inline']['right']
def merge_para_with_text(para_block):
block_text = ''
@@ -168,9 +196,9 @@ def merge_para_with_text(para_block):
if span_type == ContentType.Text:
content = ocr_escape_special_markdown_char(span['content'])
elif span_type == ContentType.InlineEquation:
content = f"${span['content']}$"
content = f"{inline_left_delimiter}{span['content']}{inline_right_delimiter}"
elif span_type == ContentType.InterlineEquation:
content = f"\n$$\n{span['content']}\n$$\n"
content = f"\n{display_left_delimiter}\n{span['content']}\n{display_right_delimiter}\n"
content = content.strip()
@@ -243,9 +271,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
if span['type'] == ContentType.Table:
if span.get('latex', ''):
para_content['table_body'] = f"\n\n$\n {span['latex']}\n$\n\n"
para_content['table_body'] = f"{span['latex']}"
elif span.get('html', ''):
para_content['table_body'] = f"\n\n{span['html']}\n\n"
para_content['table_body'] = f"{span['html']}"
if span.get('image_path', ''):
para_content['img_path'] = join_path(img_buket_path, span['image_path'])

View File

@@ -125,6 +125,15 @@ def get_llm_aided_config():
else:
return llm_aided_config
def get_latex_delimiter_config():
    """Return the 'latex-delimiter-config' section of the user config, or None.

    Logs a warning when the key is absent so callers can fall back to the
    built-in default delimiters.
    """
    delimiter_cfg = read_config().get('latex-delimiter-config')
    if delimiter_cfg is not None:
        return delimiter_cfg
    logger.warning(f"'latex-delimiter-config' not found in {CONFIG_FILE_NAME}, use 'None' as default")
    return None
if __name__ == '__main__':
ak, sk, endpoint = get_s3_config('llm-raw')

View File

@@ -1 +1 @@
__version__ = "1.3.8"
__version__ = "1.3.11"

View File

@@ -6,7 +6,7 @@ from tqdm import tqdm
from magic_pdf.config.constants import MODEL_NAME
from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
from magic_pdf.model.sub_modules.model_utils import (
clean_vram, crop_img, get_res_list_from_layout_res)
clean_vram, crop_img, get_res_list_from_layout_res, get_coords_and_area)
from magic_pdf.model.sub_modules.ocr.paddleocr2pytorch.ocr_utils import (
get_adjusted_mfdetrec_res, get_ocr_result_list)
@@ -148,6 +148,19 @@ class BatchAnalyze:
# Integration results
if ocr_res:
ocr_result_list = get_ocr_result_list(ocr_res, useful_list, ocr_res_list_dict['ocr_enable'], new_image, _lang)
if res["category_id"] == 3:
# ocr_result_list中所有bbox的面积之和
ocr_res_area = sum(get_coords_and_area(ocr_res_item)[4] for ocr_res_item in ocr_result_list if 'poly' in ocr_res_item)
# 求ocr_res_area和res的面积的比值
res_area = get_coords_and_area(res)[4]
if res_area > 0:
ratio = ocr_res_area / res_area
if ratio > 0.25:
res["category_id"] = 1
else:
continue
ocr_res_list_dict['layout_res'].extend(ocr_result_list)
# det_count += len(ocr_res_list_dict['ocr_res_list'])

View File

@@ -156,7 +156,10 @@ def doc_analyze(
batch_images = [images_with_extra_info]
results = []
for batch_image in batch_images:
processed_images_count = 0
for index, batch_image in enumerate(batch_images):
processed_images_count += len(batch_image)
logger.info(f'Batch {index + 1}/{len(batch_images)}: {processed_images_count} pages/{len(images_with_extra_info)} pages')
result = may_batch_image_analyze(batch_image, ocr, show_log,layout_model, formula_enable, table_enable)
results.extend(result)
@@ -186,7 +189,7 @@ def batch_doc_analyze(
formula_enable=None,
table_enable=None,
):
MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 200))
MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 100))
batch_size = MIN_BATCH_INFERENCE_SIZE
page_wh_list = []

View File

@@ -5,6 +5,7 @@ from typing import Optional
import torch
from ftfy import fix_text
from loguru import logger
from transformers import AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer, PretrainedConfig, PreTrainedModel
from transformers import VisionEncoderDecoderConfig, VisionEncoderDecoderModel
@@ -57,22 +58,322 @@ class TokenizerWrapper:
return toks
def latex_rm_whitespace(s: str):
"""Remove unnecessary whitespace from LaTeX code.
LEFT_PATTERN = re.compile(r'(\\left)(\S*)')
RIGHT_PATTERN = re.compile(r'(\\right)(\S*)')
LEFT_COUNT_PATTERN = re.compile(r'\\left(?![a-zA-Z])')
RIGHT_COUNT_PATTERN = re.compile(r'\\right(?![a-zA-Z])')
LEFT_RIGHT_REMOVE_PATTERN = re.compile(r'\\left\.?|\\right\.?')
def fix_latex_left_right(s):
"""
text_reg = r'(\\(operatorname|mathrm|text|mathbf)\s?\*? {.*?})'
letter = r'[a-zA-Z]'
noletter = r'[\W_^\d]'
names = [x[0].replace(' ', '') for x in re.findall(text_reg, s)]
s = re.sub(text_reg, lambda _: str(names.pop(0)), s)
news = s
while True:
s = news
news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, noletter), r'\1\2', s)
news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, letter), r'\1\2', news)
news = re.sub(r'(%s)\s+?(%s)' % (letter, noletter), r'\1\2', news)
if news == s:
break
修复LaTeX中的\\left和\\right命令
1. 确保它们后面跟有效分隔符
2. 平衡\\left和\\right的数量
"""
# 白名单分隔符
valid_delims_list = [r'(', r')', r'[', r']', r'{', r'}', r'/', r'|',
r'\{', r'\}', r'\lceil', r'\rceil', r'\lfloor',
r'\rfloor', r'\backslash', r'\uparrow', r'\downarrow',
r'\Uparrow', r'\Downarrow', r'\|', r'\.']
# 为\left后缺失有效分隔符的情况添加点
def fix_delim(match, is_left=True):
cmd = match.group(1) # \left 或 \right
rest = match.group(2) if len(match.groups()) > 1 else ""
if not rest or rest not in valid_delims_list:
return cmd + "."
return match.group(0)
# 使用更精确的模式匹配\left和\right命令
# 确保它们是独立的命令,不是其他命令的一部分
# 使用预编译正则和统一回调函数
s = LEFT_PATTERN.sub(lambda m: fix_delim(m, True), s)
s = RIGHT_PATTERN.sub(lambda m: fix_delim(m, False), s)
# 更精确地计算\left和\right的数量
left_count = len(LEFT_COUNT_PATTERN.findall(s)) # 不匹配\lefteqn等
right_count = len(RIGHT_COUNT_PATTERN.findall(s)) # 不匹配\rightarrow等
if left_count == right_count:
# 如果数量相等,检查是否在同一组
return fix_left_right_pairs(s)
else:
# 如果数量不等,移除所有\left和\right
# logger.debug(f"latex:{s}")
# logger.warning(f"left_count: {left_count}, right_count: {right_count}")
return LEFT_RIGHT_REMOVE_PATTERN.sub('', s)
def fix_left_right_pairs(latex_formula):
    """Repair \\left ... \\right pairs that do not close in the same brace group.

    Scans the formula while tracking '{' nesting depth. When a \\right is
    paired with a \\left that lives at a different depth, the \\right (plus
    its delimiter character) is relocated to the end of the brace group that
    contains the \\left.

    Args:
        latex_formula (str): Input LaTeX formula.

    Returns:
        str: Formula with mis-grouped \\right commands relocated.
    """
    # Indices of currently open, unescaped '{' — length == current depth.
    brace_stack = []
    # Pending \left entries: (position, brace depth, delimiter char).
    left_stack = []
    # \right spans to relocate: (start, end, insertion position).
    adjustments = []
    i = 0
    while i < len(latex_formula):
        # Skip characters that are themselves escaped (odd number of
        # immediately preceding backslashes).
        if i > 0 and latex_formula[i - 1] == '\\':
            backslash_count = 0
            j = i - 1
            while j >= 0 and latex_formula[j] == '\\':
                backslash_count += 1
                j -= 1
            if backslash_count % 2 == 1:
                i += 1
                continue
        # \left<delim>: remember its position and current depth.
        # NOTE(review): the bound check requires a delimiter character after
        # the command, so a bare "\left" at end-of-string is silently ignored.
        if i + 5 < len(latex_formula) and latex_formula[i:i + 5] == "\\left" and i + 5 < len(latex_formula):
            delimiter = latex_formula[i + 5]
            left_stack.append((i, len(brace_stack), delimiter))
            i += 6  # skip "\left" plus its delimiter
            continue
        # \right<delim>: pair it with the most recent \left.
        elif i + 6 < len(latex_formula) and latex_formula[i:i + 6] == "\\right" and i + 6 < len(latex_formula):
            delimiter = latex_formula[i + 6]
            if left_stack:
                left_pos, left_depth, left_delim = left_stack.pop()
                # Depth mismatch: \left and \right sit in different groups.
                if left_depth != len(brace_stack):
                    # End position of the brace group holding the \left.
                    target_pos = find_group_end(latex_formula, left_pos, left_depth)
                    if target_pos != -1:
                        # Record the 7-char span "\right<delim>" for relocation.
                        adjustments.append((i, i + 7, target_pos))
            i += 7  # skip "\right" plus its delimiter
            continue
        # Plain (unescaped) braces adjust the nesting depth.
        if latex_formula[i] == '{':
            brace_stack.append(i)
        elif latex_formula[i] == '}':
            if brace_stack:
                brace_stack.pop()
        i += 1
    if not adjustments:
        return latex_formula
    # Apply relocations back-to-front so earlier indices stay valid.
    result = list(latex_formula)
    adjustments.sort(reverse=True, key=lambda x: x[0])
    for start, end, target in adjustments:
        right_part = result[start:end]
        del result[start:end]
        result.insert(target, ''.join(right_part))
    return ''.join(result)
def find_group_end(text, pos, depth):
    """Return the index of the '}' closing the brace group open at *depth*.

    Scanning starts at *pos*; escaped braces are ignored.  Returns -1 when the
    group never closes.
    """
    level = depth
    for idx in range(pos, len(text)):
        ch = text[idx]
        if ch == '{' and not is_escaped(text, idx):
            level += 1
        elif ch == '}' and not is_escaped(text, idx):
            level -= 1
            if level < depth:
                return idx
    return -1


def is_escaped(text, pos):
    """Return True when text[pos] is preceded by an odd number of backslashes."""
    count = 0
    idx = pos - 1
    while idx >= 0 and text[idx] == '\\':
        count += 1
        idx -= 1
    return count % 2 == 1
def fix_unbalanced_braces(latex_formula):
    """Drop curly braces that cannot be paired up.

    Escaped braces (preceded by an odd number of backslashes) are left alone;
    every unmatched '{' or '}' is removed from the returned string.

    Args:
        latex_formula (str): Input LaTeX formula.

    Returns:
        str: The formula with all unpairable braces deleted.
    """
    open_positions = []  # indices of '{' still waiting for a partner
    orphans = set()      # indices of braces proven unmatchable
    for idx, ch in enumerate(latex_formula):
        if ch not in '{}':
            continue
        # An odd number of preceding backslashes marks the brace as escaped,
        # so it takes no part in matching.
        run = 0
        k = idx - 1
        while k >= 0 and latex_formula[k] == '\\':
            run += 1
            k -= 1
        if run % 2 == 1:
            continue
        if ch == '{':
            open_positions.append(idx)
        elif open_positions:
            open_positions.pop()
        else:
            orphans.add(idx)
    # Any '{' left open is also unmatched.
    orphans.update(open_positions)
    return ''.join(c for idx, c in enumerate(latex_formula) if idx not in orphans)
def process_latex(input_string):
    """Normalize backslashes in a LaTeX string.

    A backslash is kept as-is when it escapes a special character/whitespace
    or starts a multi-letter command (a letter followed by another letter).
    In every other case a space is inserted after the backslash.

    Args:
        input_string (str): Input LaTeX formula.

    Returns:
        str: The formula with lone backslashes spaced out.
    """
    specials = "#$%&~_^|\\{} \t\n\r\v\f"

    def _fix(match):
        follower = match.group(1)
        # Escaped special char or whitespace: leave untouched.
        if follower in specials:
            return match.group(0)
        # A letter followed by another letter looks like a command name.
        if ('a' <= follower <= 'z') or ('A' <= follower <= 'Z'):
            after = match.start() + 2  # index right after "\x"
            if after < len(input_string):
                nxt = input_string[after]
                if ('a' <= nxt <= 'z') or ('A' <= nxt <= 'Z'):
                    return match.group(0)
        # Lone backslash: separate it from the following character.
        return '\\ ' + follower

    return re.sub(r'\\(.)', _fix, input_string)
# Math environments commonly renderable by KaTeX/MathJax.
ENV_TYPES = ['array', 'matrix', 'pmatrix', 'bmatrix', 'vmatrix',
             'Bmatrix', 'Vmatrix', 'cases', 'aligned', 'gathered']
ENV_BEGIN_PATTERNS = {env: re.compile(r'\\begin\{' + env + r'\}') for env in ENV_TYPES}
ENV_END_PATTERNS = {env: re.compile(r'\\end\{' + env + r'\}') for env in ENV_TYPES}
ENV_FORMAT_PATTERNS = {env: re.compile(r'\\begin\{' + env + r'\}\{([^}]*)\}') for env in ENV_TYPES}


def fix_latex_environments(s):
    """Balance \\begin{env}/\\end{env} tags for each known environment.

    Surplus \\end tags get matching \\begin tags prepended (reusing an
    existing column spec, or '{c}' for 'array'); surplus \\begin tags get
    \\end tags appended.
    """
    for env in ENV_TYPES:
        n_begin = len(ENV_BEGIN_PATTERNS[env].findall(s))
        n_end = len(ENV_END_PATTERNS[env].findall(s))
        if n_begin == n_end:
            continue
        if n_end > n_begin:
            # Reuse the first column spec found, else a sensible default.
            spec_match = ENV_FORMAT_PATTERNS[env].search(s)
            if spec_match:
                spec = '{' + spec_match.group(1) + '}'
            else:
                spec = '{c}' if env == 'array' else ''
            prefix = '\\begin{' + env + '}' + spec + ' '
            s = prefix * (n_end - n_begin) + s
        else:
            s = s + (' \\end{' + env + '}') * (n_begin - n_end)
    return s
# \up<letters> commands (e.g. \upalpha) to be rewritten as plain \<letters>,
# except for the whitelisted suffixes handled in latex_rm_whitespace below.
UP_PATTERN = re.compile(r'\\up([a-zA-Z]+)')
# Commands stripped outright (not renderable / not wanted in output).
COMMANDS_TO_REMOVE_PATTERN = re.compile(
    r'\\(?:lefteqn|boldmath|ensuremath|centering|textsubscript|sides|textsl|textcent|emph|protect|null)')
# Literal command substitutions, applied in dict order — the order matters
# for overlapping patterns, so do not reorder these entries.
REPLACEMENTS_PATTERNS = {
    re.compile(r'\\underbar'): r'\\underline',
    re.compile(r'\\Bar'): r'\\hat',
    re.compile(r'\\Hat'): r'\\hat',
    re.compile(r'\\Tilde'): r'\\tilde',
    re.compile(r'\\slash'): r'/',
    re.compile(r'\\textperthousand'): r'',
    re.compile(r'\\sun'): r'',
    re.compile(r'\\textunderscore'): r'\\_',
    re.compile(r'\\fint'): r'',
    re.compile(r'\\up '): r'\\ ',
    re.compile(r'\\vline = '): r'\\models ',
    re.compile(r'\\vDash '): r'\\models ',
    re.compile(r'\\sq \\sqcup '): r'\\square ',
}
# \qquad not followed by whitespace gets a trailing space appended.
QQUAD_PATTERN = re.compile(r'\\qquad(?!\s)')


def latex_rm_whitespace(s: str) -> str:
    """Remove unnecessary whitespace from LaTeX code.

    Also sanitizes model-emitted LaTeX: balances braces, repairs
    \\left/\\right pairs, balances environments, rewrites/removes commands
    KaTeX cannot render, and normalizes backslash spacing.
    """
    s = fix_unbalanced_braces(s)
    s = fix_latex_left_right(s)
    s = fix_latex_environments(s)
    # Rewrite \up<name> -> \<name>, keeping \uparrow, \updownarrow (via
    # "downarrow"), \uplus and \upsilon intact through the suffix whitelist.
    s = UP_PATTERN.sub(
        lambda m: m.group(0) if m.group(1) in ["arrow", "downarrow", "lus", "silon"] else f"\\{m.group(1)}", s
    )
    s = COMMANDS_TO_REMOVE_PATTERN.sub('', s)
    # Apply the literal substitutions in declaration order.
    for pattern, replacement in REPLACEMENTS_PATTERNS.items():
        s = pattern.sub(replacement, s)
    # Normalize backslash/space handling.
    s = process_latex(s)
    # Ensure \qquad is always followed by a space.
    s = QQUAD_PATTERN.sub(r'\\qquad ', s)
    return s

View File

@@ -31,10 +31,10 @@ def crop_img(input_res, input_np_img, crop_paste_x=0, crop_paste_y=0):
return return_image, return_list
def get_coords_and_area(table):
def get_coords_and_area(block_with_poly):
"""Extract coordinates and area from a table."""
xmin, ymin = int(table['poly'][0]), int(table['poly'][1])
xmax, ymax = int(table['poly'][4]), int(table['poly'][5])
xmin, ymin = int(block_with_poly['poly'][0]), int(block_with_poly['poly'][1])
xmax, ymax = int(block_with_poly['poly'][4]), int(block_with_poly['poly'][5])
area = (xmax - xmin) * (ymax - ymin)
return xmin, ymin, xmax, ymax, area
@@ -172,8 +172,8 @@ def filter_nested_tables(table_res_list, overlap_threshold=0.8, area_threshold=0
tables_inside = [j for j in range(len(table_res_list))
if i != j and is_inside(table_info[j], table_info[i], overlap_threshold)]
# Continue if there are at least 2 tables inside
if len(tables_inside) >= 2:
# Continue if there are at least 3 tables inside
if len(tables_inside) >= 3:
# Check if inside tables overlap with each other
tables_overlap = any(do_overlap(table_info[tables_inside[idx1]], table_info[tables_inside[idx2]])
for idx1 in range(len(tables_inside))
@@ -243,7 +243,7 @@ def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshol
"bbox": [int(res['poly'][0]), int(res['poly'][1]),
int(res['poly'][4]), int(res['poly'][5])],
})
elif category_id in [0, 2, 4, 6, 7]: # OCR regions
elif category_id in [0, 2, 4, 6, 7, 3]: # OCR regions
ocr_res_list.append(res)
elif category_id == 5: # Table regions
table_res_list.append(res)

View File

@@ -35,7 +35,7 @@ def build_backbone(config, model_type):
from .rec_mobilenet_v3 import MobileNetV3
from .rec_svtrnet import SVTRNet
from .rec_mv1_enhance import MobileNetV1Enhance
from .rec_pphgnetv2 import PPHGNetV2_B4
support_dict = [
"MobileNetV1Enhance",
"MobileNetV3",
@@ -48,6 +48,7 @@ def build_backbone(config, model_type):
"DenseNet",
"PPLCNetV3",
"PPHGNet_small",
"PPHGNetV2_B4",
]
else:
raise NotImplementedError

View File

@@ -0,0 +1,810 @@
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
class AdaptiveAvgPool2D(nn.AdaptiveAvgPool2d):
    """AdaptiveAvgPool2d with a fast path for global average pooling.

    When the requested output size is 1 (or (1, 1)), forward reduces with a
    plain mean over the spatial dims instead of dispatching to
    F.adaptive_avg_pool2d.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        size = self.output_size
        if isinstance(size, int):
            self._gap = size == 1
        elif isinstance(size, tuple):
            self._gap = size[0] == 1 and size[1] == 1
        else:
            self._gap = False

    def forward(self, x):
        if not self._gap:
            return F.adaptive_avg_pool2d(x, output_size=self.output_size)
        # Global average pooling: mean over H and W, keep a 1x1 map.
        n, c = x.shape[0], x.shape[1]
        return torch.mean(x, dim=[2, 3]).reshape(n, c, 1, 1)
class LearnableAffineBlock(nn.Module):
    """Learnable scale-and-shift block: y = scale * x + bias.

    Args:
        scale_value (float): Initial value of the scale parameter. Defaults to 1.0.
        bias_value (float): Initial value of the bias parameter. Defaults to 0.0.
        lr_mult (float): Accepted for config compatibility; not used by this port.
        lab_lr (float): Accepted for config compatibility; not used by this port.
    """

    def __init__(self, scale_value=1.0, bias_value=0.0, lr_mult=1.0, lab_lr=0.01):
        super().__init__()
        self.scale = nn.Parameter(torch.Tensor([scale_value]))
        self.bias = nn.Parameter(torch.Tensor([bias_value]))

    def forward(self, x):
        return x * self.scale + self.bias
class ConvBNAct(nn.Module):
    """
    ConvBNAct is a combination of convolution and batchnorm layers:
    Conv2d -> BatchNorm2d [-> ReLU] [-> LearnableAffineBlock].

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (int): Size of the convolution kernel. Defaults to 3.
        stride (int): Stride of the convolution. Defaults to 1.
        padding (int/str): Only string values (e.g. "same") are forwarded to
            nn.Conv2d; any numeric value is replaced by the symmetric
            (kernel_size - 1) // 2 padding. Defaults to 1.
        groups (int): Number of groups for the convolution. Defaults to 1.
        use_act: (bool): Whether to use activation function. Defaults to True.
        use_lab (bool): Whether to use the LAB operation. Defaults to False.
        lr_mult (float): Learning rate multiplier for the layer. Defaults to 1.0.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size=3,
        stride=1,
        padding=1,
        groups=1,
        use_act=True,
        use_lab=False,
        lr_mult=1.0,
    ):
        super().__init__()
        self.use_act = use_act
        self.use_lab = use_lab
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            # NOTE(review): numeric `padding` arguments are ignored here in
            # favour of (kernel_size - 1) // 2 — presumably to mirror the
            # Paddle original; confirm before relying on integer `padding`.
            padding=padding if isinstance(padding, str) else (kernel_size - 1) // 2,
            groups=groups,
            bias=False,  # BatchNorm supplies the affine bias
        )
        self.bn = nn.BatchNorm2d(
            out_channels,
        )
        if self.use_act:
            self.act = nn.ReLU()
        if self.use_lab:
            self.lab = LearnableAffineBlock(lr_mult=lr_mult)

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        if self.use_act:
            x = self.act(x)
        if self.use_lab:
            x = self.lab(x)
        return x
class LightConvBNAct(nn.Module):
    """
    LightConvBNAct is a combination of pw and dw layers: a 1x1 point-wise
    ConvBNAct without activation, followed by a depth-wise ConvBNAct with ReLU.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (int): Size of the depth-wise convolution kernel.
        use_lab (bool): Whether to use the LAB operation. Defaults to False.
        lr_mult (float): Learning rate multiplier for the layer. Defaults to 1.0.
        **kwargs: Extra keyword arguments accepted for call-site compatibility
            (e.g. `stride` passed by HGV2_Block); they are ignored.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        use_lab=False,
        lr_mult=1.0,
        **kwargs,
    ):
        super().__init__()
        # Point-wise 1x1 projection, linear (no activation).
        self.conv1 = ConvBNAct(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            use_act=False,
            use_lab=use_lab,
            lr_mult=lr_mult,
        )
        # Depth-wise conv (groups == channels) with activation.
        self.conv2 = ConvBNAct(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            groups=out_channels,
            use_act=True,
            use_lab=use_lab,
            lr_mult=lr_mult,
        )

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        return x
class CustomMaxPool2d(nn.Module):
    """MaxPool2d with additional support for TensorFlow/Paddle-style padding="same".

    With padding="same" the input is zero-padded (extra pixel on the
    bottom/right when the total pad is odd) so the output size is
    ceil(input / stride).  Any other padding value is forwarded to a stock
    nn.MaxPool2d.

    Args:
        kernel_size (int/tuple): Pooling window size.
        stride (int/tuple): Window stride; defaults to kernel_size.
        padding (int/tuple/str): Numeric padding, or the string "same".
        dilation (int/tuple): Window dilation. Defaults to 1.
        return_indices (bool): Also return argmax indices. Defaults to False.
        ceil_mode (bool): Use ceil instead of floor for output size. Defaults to False.
        data_format (str): Accepted for Paddle-API parity; only "NCHW" layout
            is actually supported by the underlying torch ops.
    """

    def __init__(
        self,
        kernel_size,
        stride=None,
        padding=0,
        dilation=1,
        return_indices=False,
        ceil_mode=False,
        data_format="NCHW",
    ):
        super(CustomMaxPool2d, self).__init__()
        # Normalize scalar arguments to (h, w) pairs.
        self.kernel_size = kernel_size if isinstance(kernel_size, (tuple, list)) else (kernel_size, kernel_size)
        self.stride = stride if stride is not None else self.kernel_size
        self.stride = self.stride if isinstance(self.stride, (tuple, list)) else (self.stride, self.stride)
        self.dilation = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)
        self.return_indices = return_indices
        self.ceil_mode = ceil_mode
        self.padding_mode = padding
        # Any padding other than "same" delegates to the stock MaxPool2d.
        if padding != "same":
            self.padding = padding if isinstance(padding, (tuple, list)) else (padding, padding)
            self.pool = nn.MaxPool2d(
                kernel_size=self.kernel_size,
                stride=self.stride,
                padding=self.padding,
                dilation=self.dilation,
                return_indices=self.return_indices,
                ceil_mode=self.ceil_mode,
            )

    def forward(self, x):
        if self.padding_mode != "same":
            return self.pool(x)
        # "same" padding: pad so the output is ceil(input / stride).
        in_h, in_w = x.size(2), x.size(3)
        out_h = math.ceil(in_h / self.stride[0])
        out_w = math.ceil(in_w / self.stride[1])
        # Effective window extent accounts for dilation: (k - 1) * d + 1.
        # (Fix: the previous version used the raw kernel size, which under-pads
        # whenever dilation > 1; with the default dilation of 1 this is identical.)
        eff_kh = (self.kernel_size[0] - 1) * self.dilation[0] + 1
        eff_kw = (self.kernel_size[1] - 1) * self.dilation[1] + 1
        pad_h = max((out_h - 1) * self.stride[0] + eff_kh - in_h, 0)
        pad_w = max((out_w - 1) * self.stride[1] + eff_kw - in_w, 0)
        # Split the padding, putting the extra pixel on the bottom/right.
        pad_top = pad_h // 2
        pad_left = pad_w // 2
        x = F.pad(x, (pad_left, pad_w - pad_left, pad_top, pad_h - pad_top))
        pool_fn = F.max_pool2d_with_indices if self.return_indices else F.max_pool2d
        return pool_fn(
            x,
            kernel_size=self.kernel_size,
            stride=self.stride,
            padding=0,  # already padded manually above
            dilation=self.dilation,
            ceil_mode=self.ceil_mode,
        )
class StemBlock(nn.Module):
    """
    StemBlock for PP-HGNetV2.

    stem1 downsamples by 2; its output feeds two parallel branches — a 2x2
    conv pair (stem2a/stem2b) and a size-preserving max pool — whose channel
    concatenation is fused by stem3 (stride 2, or stride 1 in text_rec mode)
    and projected by the 1x1 stem4.

    Args:
        in_channels (int): Number of input channels.
        mid_channels (int): Number of middle channels.
        out_channels (int): Number of output channels.
        use_lab (bool): Whether to use the LAB operation. Defaults to False.
        lr_mult (float): Learning rate multiplier for the layer. Defaults to 1.0.
        text_rec (bool): Text-recognition mode; keeps stride 1 in stem3 to
            preserve spatial resolution. Defaults to False.
    """

    def __init__(
        self,
        in_channels,
        mid_channels,
        out_channels,
        use_lab=False,
        lr_mult=1.0,
        text_rec=False,
    ):
        super().__init__()
        # 3x3 stride-2 conv: first downsampling step.
        self.stem1 = ConvBNAct(
            in_channels=in_channels,
            out_channels=mid_channels,
            kernel_size=3,
            stride=2,
            use_lab=use_lab,
            lr_mult=lr_mult,
        )
        # 2x2 convs with "same" padding keep the spatial size unchanged.
        self.stem2a = ConvBNAct(
            in_channels=mid_channels,
            out_channels=mid_channels // 2,
            kernel_size=2,
            stride=1,
            padding="same",
            use_lab=use_lab,
            lr_mult=lr_mult,
        )
        self.stem2b = ConvBNAct(
            in_channels=mid_channels // 2,
            out_channels=mid_channels,
            kernel_size=2,
            stride=1,
            padding="same",
            use_lab=use_lab,
            lr_mult=lr_mult,
        )
        # Fuses the concatenated branches; text_rec keeps full resolution.
        self.stem3 = ConvBNAct(
            in_channels=mid_channels * 2,
            out_channels=mid_channels,
            kernel_size=3,
            stride=1 if text_rec else 2,
            use_lab=use_lab,
            lr_mult=lr_mult,
        )
        # Final 1x1 projection to out_channels.
        self.stem4 = ConvBNAct(
            in_channels=mid_channels,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            use_lab=use_lab,
            lr_mult=lr_mult,
        )
        # "same"-padded 2x2/stride-1 pool keeps x1 the same spatial size as
        # x2 so the two branches can be concatenated.
        self.pool = CustomMaxPool2d(
            kernel_size=2, stride=1, ceil_mode=True, padding="same"
        )

    def forward(self, x):
        x = self.stem1(x)
        # Parallel branches: conv pair (stem2a/stem2b) and max pool.
        x2 = self.stem2a(x)
        x2 = self.stem2b(x2)
        x1 = self.pool(x)
        x = torch.cat([x1, x2], 1)
        x = self.stem3(x)
        x = self.stem4(x)
        return x
class HGV2_Block(nn.Module):
    """
    HGV2_Block, the basic unit that constitutes the HGV2_Stage.

    Runs `layer_num` stacked conv layers, concatenates the input together
    with every intermediate output, then squeezes/excites the concatenation
    back to `out_channels` with two 1x1 ConvBNAct layers.

    Args:
        in_channels (int): Number of input channels.
        mid_channels (int): Number of middle channels.
        out_channels (int): Number of output channels.
        kernel_size (int): Size of the convolution kernel. Defaults to 3.
        layer_num (int): Number of layers in the HGV2 block. Defaults to 6.
        identity (bool): Add the input back as a residual (requires
            in_channels == out_channels). Defaults to False.
        light_block (bool): Use LightConvBNAct instead of ConvBNAct for the
            stacked layers. Defaults to True.
        use_lab (bool): Whether to use the LAB operation. Defaults to False.
        lr_mult (float): Learning rate multiplier for the layer. Defaults to 1.0.
    """

    def __init__(
        self,
        in_channels,
        mid_channels,
        out_channels,
        kernel_size=3,
        layer_num=6,
        identity=False,
        light_block=True,
        use_lab=False,
        lr_mult=1.0,
    ):
        super().__init__()
        self.identity = identity
        self.layers = nn.ModuleList()
        # Fix: select the layer class directly instead of eval() on a string —
        # same behavior, no dynamic code execution.
        layer_cls = LightConvBNAct if light_block else ConvBNAct
        for i in range(layer_num):
            self.layers.append(
                layer_cls(
                    in_channels=in_channels if i == 0 else mid_channels,
                    out_channels=mid_channels,
                    stride=1,
                    kernel_size=kernel_size,
                    use_lab=use_lab,
                    lr_mult=lr_mult,
                )
            )
        # Feature aggregation: the input plus every layer output are
        # concatenated, then squeezed and re-excited by 1x1 convs.
        total_channels = in_channels + layer_num * mid_channels
        self.aggregation_squeeze_conv = ConvBNAct(
            in_channels=total_channels,
            out_channels=out_channels // 2,
            kernel_size=1,
            stride=1,
            use_lab=use_lab,
            lr_mult=lr_mult,
        )
        self.aggregation_excitation_conv = ConvBNAct(
            in_channels=out_channels // 2,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            use_lab=use_lab,
            lr_mult=lr_mult,
        )

    def forward(self, x):
        identity = x
        output = [x]
        for layer in self.layers:
            x = layer(x)
            output.append(x)
        x = torch.cat(output, dim=1)
        x = self.aggregation_squeeze_conv(x)
        x = self.aggregation_excitation_conv(x)
        if self.identity:
            x += identity
        return x
class HGV2_Stage(nn.Module):
    """
    HGV2_Stage, the basic unit that constitutes the PPHGNetV2.

    Args:
        in_channels (int): Number of input channels.
        mid_channels (int): Number of middle channels.
        out_channels (int): Number of output channels.
        block_num (int): Number of blocks in the HGV2 stage.
        layer_num (int): Number of layers in the HGV2 block. Defaults to 6.
        is_downsample (bool): Whether to use downsampling operation. Defaults to True.
        light_block (bool): Whether to use light block. Defaults to True.
        kernel_size (int): Size of the convolution kernel. Defaults to 3.
        use_lab (bool, optional): Whether to use the LAB operation. Defaults to False.
        stride (int or list): Stride of the depth-wise downsampling conv;
            recognition configs use asymmetric strides such as [2, 1]. Defaults to 2.
        lr_mult (float, optional): Learning rate multiplier for the layer. Defaults to 1.0.
    """

    def __init__(
        self,
        in_channels,
        mid_channels,
        out_channels,
        block_num,
        layer_num=6,
        is_downsample=True,
        light_block=True,
        kernel_size=3,
        use_lab=False,
        stride=2,
        lr_mult=1.0,
    ):
        super().__init__()
        self.is_downsample = is_downsample
        if self.is_downsample:
            # Depth-wise strided conv reduces spatial size without mixing channels.
            self.downsample = ConvBNAct(
                in_channels=in_channels,
                out_channels=in_channels,
                kernel_size=3,
                stride=stride,
                groups=in_channels,
                use_act=False,
                use_lab=use_lab,
                lr_mult=lr_mult,
            )
        blocks_list = []
        for i in range(block_num):
            blocks_list.append(
                HGV2_Block(
                    in_channels=in_channels if i == 0 else out_channels,
                    mid_channels=mid_channels,
                    out_channels=out_channels,
                    kernel_size=kernel_size,
                    layer_num=layer_num,
                    # First block changes the channel count, so no residual there.
                    identity=False if i == 0 else True,
                    light_block=light_block,
                    use_lab=use_lab,
                    lr_mult=lr_mult,
                )
            )
        self.blocks = nn.Sequential(*blocks_list)

    def forward(self, x):
        if self.is_downsample:
            x = self.downsample(x)
        x = self.blocks(x)
        return x
class DropoutInferDownscale(nn.Module):
    """Dropout equivalent to Paddle's mode="downscale_in_infer".

    Training: out = input * mask (the kept units are NOT rescaled by 1/(1-p)).
    Inference: out = input * (1 - p) (deterministic downscale).
    """

    def __init__(self, p=0.5):
        super().__init__()
        self.p = p

    def forward(self, x):
        keep = 1.0 - self.p
        if not self.training:
            # Inference: scale everything down by the keep probability.
            return x * keep
        # F.dropout rescales kept units by 1/(1-p); multiply by (1-p) to undo
        # that and obtain a bare random mask, matching Paddle's behavior.
        return F.dropout(x, self.p, training=True) * keep
class PPHGNetV2(nn.Module):
    """
    PPHGNetV2

    Args:
        stage_config (dict): Config for PPHGNetV2 stages, mapping stage name to
            a 9-element list: (in_channels, mid_channels, out_channels,
            block_num, is_downsample, light_block, kernel_size, layer_num, stride).
        stem_channels: (list): Number of channels of the stem of the PPHGNetV2.
        use_lab (bool): Whether to use the LAB operation. Defaults to False.
        use_last_conv (bool): Whether to use the last conv layer as the output channel. Defaults to True.
        class_expand (int): Number of channels for the last 1x1 convolutional layer.
        dropout_prob (float): Dropout probability for the last 1x1 convolutional layer. Defaults to 0.0.
        class_num (int): The number of classes for the classification layer. Defaults to 1000.
        lr_mult_list (list): Learning rate multiplier for the stages. Defaults to [1.0, 1.0, 1.0, 1.0, 1.0].
        det (bool): Detection mode — forward returns the feature maps of the
            stages listed in out_indices. Defaults to False.
        text_rec (bool): Text-recognition mode — forward pools the last stage
            output. Defaults to False.
        out_indices (list): Stage indices collected in detection mode.
            Defaults to [0, 1, 2, 3].

    Returns:
        model: nn.Layer. Specific PPHGNetV2 model depends on args.
    """

    def __init__(
        self,
        stage_config,
        stem_channels=[3, 32, 64],
        use_lab=False,
        use_last_conv=True,
        class_expand=2048,
        dropout_prob=0.0,
        class_num=1000,
        lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],
        det=False,
        text_rec=False,
        out_indices=None,
        **kwargs,
    ):
        super().__init__()
        self.det = det
        self.text_rec = text_rec
        self.use_lab = use_lab
        self.use_last_conv = use_last_conv
        self.class_expand = class_expand
        self.class_num = class_num
        self.out_indices = out_indices if out_indices is not None else [0, 1, 2, 3]
        self.out_channels = []

        # stem
        self.stem = StemBlock(
            in_channels=stem_channels[0],
            mid_channels=stem_channels[1],
            out_channels=stem_channels[2],
            use_lab=use_lab,
            lr_mult=lr_mult_list[0],
            text_rec=text_rec,
        )

        # stages — creation order defines the state_dict keys, so it must
        # match the checkpoint layout; do not reorder.
        self.stages = nn.ModuleList()
        for i, k in enumerate(stage_config):
            (
                in_channels,
                mid_channels,
                out_channels,
                block_num,
                is_downsample,
                light_block,
                kernel_size,
                layer_num,
                stride,
            ) = stage_config[k]
            self.stages.append(
                HGV2_Stage(
                    in_channels,
                    mid_channels,
                    out_channels,
                    block_num,
                    layer_num,
                    is_downsample,
                    light_block,
                    kernel_size,
                    use_lab,
                    stride,
                    lr_mult=lr_mult_list[i + 1],
                )
            )
            if i in self.out_indices:
                self.out_channels.append(out_channels)
        if not self.det:
            # Non-detection mode reports a single channel count (last stage)
            # instead of the per-stage list.
            self.out_channels = stage_config["stage4"][2]

        # Classification head modules. NOTE(review): forward() below never
        # applies avg_pool/last_conv/fc in any mode — presumably kept for
        # checkpoint/key compatibility with the Paddle original; confirm.
        self.avg_pool = AdaptiveAvgPool2D(1)
        if self.use_last_conv:
            self.last_conv = nn.Conv2d(
                in_channels=out_channels,
                out_channels=self.class_expand,
                kernel_size=1,
                stride=1,
                padding=0,
                bias=False,
            )
            self.act = nn.ReLU()
            if self.use_lab:
                self.lab = LearnableAffineBlock()
            self.dropout = DropoutInferDownscale(p=dropout_prob)
        self.flatten = nn.Flatten(start_dim=1, end_dim=-1)
        if not self.det:
            self.fc = nn.Linear(
                self.class_expand if self.use_last_conv else out_channels,
                self.class_num,
            )

        self._init_weights()

    def _init_weights(self):
        # Kaiming init for convs; BN to identity.
        # NOTE(review): nn.Linear only has its bias zeroed (weight left at the
        # PyTorch default, and a bias-free Linear would raise here) — confirm
        # this matches the intended Paddle init.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Linear):
                nn.init.zeros_(m.bias)

    def forward(self, x):
        x = self.stem(x)
        out = []
        for i, stage in enumerate(self.stages):
            x = stage(x)
            # Detection mode collects the selected intermediate feature maps.
            if self.det and i in self.out_indices:
                out.append(x)
        if self.det:
            return out

        if self.text_rec:
            if self.training:
                # Fixed-width pooling during training.
                x = F.adaptive_avg_pool2d(x, [1, 40])
            else:
                x = F.avg_pool2d(x, [3, 2])
        return x
def PPHGNetV2_B0(pretrained=False, use_ssld=False, **kwargs):
"""
PPHGNetV2_B0
Args:
pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld (bool) Whether using ssld pretrained model when pretrained is True.
Returns:
model: nn.Layer. Specific `PPHGNetV2_B0` model depends on args.
"""
stage_config = {
# in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num
"stage1": [16, 16, 64, 1, False, False, 3, 3],
"stage2": [64, 32, 256, 1, True, False, 3, 3],
"stage3": [256, 64, 512, 2, True, True, 5, 3],
"stage4": [512, 128, 1024, 1, True, True, 5, 3],
}
model = PPHGNetV2(
stem_channels=[3, 16, 16], stage_config=stage_config, use_lab=True, **kwargs
)
return model
def PPHGNetV2_B1(pretrained=False, use_ssld=False, **kwargs):
"""
PPHGNetV2_B1
Args:
pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld (bool) Whether using ssld pretrained model when pretrained is True.
Returns:
model: nn.Layer. Specific `PPHGNetV2_B1` model depends on args.
"""
stage_config = {
# in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num
"stage1": [32, 32, 64, 1, False, False, 3, 3],
"stage2": [64, 48, 256, 1, True, False, 3, 3],
"stage3": [256, 96, 512, 2, True, True, 5, 3],
"stage4": [512, 192, 1024, 1, True, True, 5, 3],
}
model = PPHGNetV2(
stem_channels=[3, 24, 32], stage_config=stage_config, use_lab=True, **kwargs
)
return model
def PPHGNetV2_B2(pretrained=False, use_ssld=False, **kwargs):
"""
PPHGNetV2_B2
Args:
pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld (bool) Whether using ssld pretrained model when pretrained is True.
Returns:
model: nn.Layer. Specific `PPHGNetV2_B2` model depends on args.
"""
stage_config = {
# in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num
"stage1": [32, 32, 96, 1, False, False, 3, 4],
"stage2": [96, 64, 384, 1, True, False, 3, 4],
"stage3": [384, 128, 768, 3, True, True, 5, 4],
"stage4": [768, 256, 1536, 1, True, True, 5, 4],
}
model = PPHGNetV2(
stem_channels=[3, 24, 32], stage_config=stage_config, use_lab=True, **kwargs
)
return model
def PPHGNetV2_B3(pretrained=False, use_ssld=False, **kwargs):
"""
PPHGNetV2_B3
Args:
pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld (bool) Whether using ssld pretrained model when pretrained is True.
Returns:
model: nn.Layer. Specific `PPHGNetV2_B3` model depends on args.
"""
stage_config = {
# in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num
"stage1": [32, 32, 128, 1, False, False, 3, 5],
"stage2": [128, 64, 512, 1, True, False, 3, 5],
"stage3": [512, 128, 1024, 3, True, True, 5, 5],
"stage4": [1024, 256, 2048, 1, True, True, 5, 5],
}
model = PPHGNetV2(
stem_channels=[3, 24, 32], stage_config=stage_config, use_lab=True, **kwargs
)
return model
def PPHGNetV2_B4(pretrained=False, use_ssld=False, det=False, text_rec=False, **kwargs):
"""
PPHGNetV2_B4
Args:
pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld (bool) Whether using ssld pretrained model when pretrained is True.
Returns:
model: nn.Layer. Specific `PPHGNetV2_B4` model depends on args.
"""
stage_config_rec = {
# in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num, stride
"stage1": [48, 48, 128, 1, True, False, 3, 6, [2, 1]],
"stage2": [128, 96, 512, 1, True, False, 3, 6, [1, 2]],
"stage3": [512, 192, 1024, 3, True, True, 5, 6, [2, 1]],
"stage4": [1024, 384, 2048, 1, True, True, 5, 6, [2, 1]],
}
stage_config_det = {
# in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num
"stage1": [48, 48, 128, 1, False, False, 3, 6, 2],
"stage2": [128, 96, 512, 1, True, False, 3, 6, 2],
"stage3": [512, 192, 1024, 3, True, True, 5, 6, 2],
"stage4": [1024, 384, 2048, 1, True, True, 5, 6, 2],
}
model = PPHGNetV2(
stem_channels=[3, 32, 48],
stage_config=stage_config_det if det else stage_config_rec,
use_lab=False,
det=det,
text_rec=text_rec,
**kwargs,
)
return model
def PPHGNetV2_B5(pretrained=False, use_ssld=False, **kwargs):
    """Build the PPHGNetV2-B5 backbone.

    Args:
        pretrained (bool/str): If ``True`` load pretrained parameters, ``False``
            otherwise. If str, means the path of the pretrained model.
        use_ssld (bool): Whether using ssld pretrained model when pretrained is True.

    Returns:
        model: nn.Layer. Specific ``PPHGNetV2_B5`` model depends on args.
    """
    # Per-stage fields: in_channels, mid_channels, out_channels, num_blocks,
    # is_downsample, light_block, kernel_size, layer_num
    cfg = {
        "stage1": [64, 64, 128, 1, False, False, 3, 6],
        "stage2": [128, 128, 512, 2, True, False, 3, 6],
        "stage3": [512, 256, 1024, 5, True, True, 5, 6],
        "stage4": [1024, 512, 2048, 2, True, True, 5, 6],
    }
    return PPHGNetV2(
        stem_channels=[3, 32, 64],
        stage_config=cfg,
        use_lab=False,
        **kwargs,
    )
def PPHGNetV2_B6(pretrained=False, use_ssld=False, **kwargs):
    """Build the PPHGNetV2-B6 backbone.

    Args:
        pretrained (bool/str): If ``True`` load pretrained parameters, ``False``
            otherwise. If str, means the path of the pretrained model.
        use_ssld (bool): Whether using ssld pretrained model when pretrained is True.

    Returns:
        model: nn.Layer. Specific ``PPHGNetV2_B6`` model depends on args.
    """
    # Per-stage fields: in_channels, mid_channels, out_channels, num_blocks,
    # is_downsample, light_block, kernel_size, layer_num
    cfg = {
        "stage1": [96, 96, 192, 2, False, False, 3, 6],
        "stage2": [192, 192, 512, 3, True, False, 3, 6],
        "stage3": [512, 384, 1024, 6, True, True, 5, 6],
        "stage4": [1024, 768, 2048, 3, True, True, 5, 6],
    }
    return PPHGNetV2(
        stem_channels=[3, 48, 96],
        stage_config=cfg,
        use_lab=False,
        **kwargs,
    )

View File

@@ -9,14 +9,27 @@ class Im2Seq(nn.Module):
super().__init__()
self.out_channels = in_channels
# def forward(self, x):
# B, C, H, W = x.shape
# # assert H == 1
# x = x.squeeze(dim=2)
# # x = x.transpose([0, 2, 1]) # paddle (NTC)(batch, width, channels)
# x = x.permute(0, 2, 1)
# return x
def forward(self, x):
    """Flatten a 4-D feature map (B, C, H, W) into a sequence.

    The rendered diff left the old body (which always squeezes H and
    returns early) followed by an unreachable branch for H != 1; this
    merges them into the intended single control flow.

    Args:
        x: Tensor of shape (B, C, H, W).

    Returns:
        Tensor of shape (B, W, C) when H == 1, otherwise (B, H*W, C).
    """
    B, C, H, W = x.shape
    if H == 1:
        # Original path for unit-height maps: drop H, then NCW -> NWC
        # (paddle's (NTC): batch, width, channels).
        x = x.squeeze(dim=2)
        x = x.permute(0, 2, 1)  # (B, W, C)
    else:
        # General path: flatten the spatial dims into one sequence axis.
        x = x.permute(0, 2, 3, 1)  # (B, H, W, C)
        x = x.reshape(B, H * W, C)  # (B, H*W, C)
    return x
class EncoderWithRNN_(nn.Module):
def __init__(self, in_channels, hidden_size):

View File

@@ -104,6 +104,22 @@ ch_PP-OCRv4_det_infer:
name: DBHead
k: 50
ch_PP-OCRv5_det_infer:
model_type: det
algorithm: DB
Transform: null
Backbone:
name: PPLCNetV3
scale: 0.75
det: True
Neck:
name: RSEFPN
out_channels: 96
shortcut: True
Head:
name: DBHead
k: 50
ch_PP-OCRv4_det_server_infer:
model_type: det
algorithm: DB
@@ -196,6 +212,58 @@ ch_PP-OCRv4_rec_server_doc_infer:
nrtr_dim: 384
max_text_length: 25
ch_PP-OCRv5_rec_server_infer:
model_type: rec
algorithm: SVTR_HGNet
Transform:
Backbone:
name: PPHGNetV2_B4
text_rec: True
Head:
name: MultiHead
out_channels_list:
CTCLabelDecode: 18385
head_list:
- CTCHead:
Neck:
name: svtr
dims: 120
depth: 2
hidden_dims: 120
kernel_size: [ 1, 3 ]
use_guide: True
Head:
fc_decay: 0.00001
- NRTRHead:
nrtr_dim: 384
max_text_length: 25
ch_PP-OCRv5_rec_infer:
model_type: rec
algorithm: SVTR_HGNet
Transform:
Backbone:
name: PPLCNetV3
scale: 0.95
Head:
name: MultiHead
out_channels_list:
CTCLabelDecode: 18385
head_list:
- CTCHead:
Neck:
name: svtr
dims: 120
depth: 2
hidden_dims: 120
kernel_size: [ 1, 3 ]
use_guide: True
Head:
fc_decay: 0.00001
- NRTRHead:
nrtr_dim: 384
max_text_length: 25
chinese_cht_PP-OCRv3_rec_infer:
model_type: rec
algorithm: SVTR

View File

@@ -1,9 +1,17 @@
lang:
ch_lite:
det: ch_PP-OCRv3_det_infer.pth
rec: ch_PP-OCRv5_rec_infer.pth
dict: ppocrv5_dict.txt
ch_lite_v4:
det: ch_PP-OCRv3_det_infer.pth
rec: ch_PP-OCRv4_rec_infer.pth
dict: ppocr_keys_v1.txt
ch_server:
det: ch_PP-OCRv3_det_infer.pth
rec: ch_PP-OCRv5_rec_server_infer.pth
dict: ppocrv5_dict.txt
ch_server_v4:
det: ch_PP-OCRv3_det_infer.pth
rec: ch_PP-OCRv4_rec_server_infer.pth
dict: ppocr_keys_v1.txt

View File

@@ -76,11 +76,11 @@ In the final step, enter ``yes``, close the terminal, and reopen it.
4. Create an Environment Using Conda
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Specify Python version 3.10.
Specify Python version 3.13.
.. code:: sh
conda create -n mineru 'python>=3.10' -y
conda create -n mineru 'python=3.12' -y
conda activate mineru
5. Install Applications
@@ -155,14 +155,15 @@ to test CUDA acceleration:
Windows 10/11
--------------
1. Install CUDA and cuDNN
1. Install CUDA
~~~~~~~~~~~~~~~~~~~~~~~~~
You need to install a CUDA version that is compatible with torch's requirements. Currently, torch supports CUDA 11.8/12.4/12.6.
You need to install a CUDA version that is compatible with torch's requirements. For details, please refer to the [official PyTorch website](https://pytorch.org/get-started/locally/).
- CUDA 11.8 https://developer.nvidia.com/cuda-11-8-0-download-archive
- CUDA 12.4 https://developer.nvidia.com/cuda-12-4-0-download-archive
- CUDA 12.6 https://developer.nvidia.com/cuda-12-6-0-download-archive
- CUDA 12.8 https://developer.nvidia.com/cuda-12-8-0-download-archive
2. Install Anaconda
@@ -177,7 +178,7 @@ Download link: https://repo.anaconda.com/archive/Anaconda3-2024.06-1-Windows-x86
::
conda create -n mineru 'python>=3.10' -y
conda create -n mineru 'python=3.12' -y
conda activate mineru
4. Install Applications

View File

@@ -61,7 +61,7 @@ Also you can try `online demo <https://www.modelscope.cn/studios/OpenDataLab/Min
</tr>
<tr>
<td colspan="3">Python Version</td>
<td colspan="3">3.10~3.12</td>
<td colspan="3">3.10~3.13</td>
</tr>
<tr>
<td colspan="3">Nvidia Driver Version</td>
@@ -71,8 +71,7 @@ Also you can try `online demo <https://www.modelscope.cn/studios/OpenDataLab/Min
</tr>
<tr>
<td colspan="3">CUDA Environment</td>
<td>11.8/12.4/12.6/12.8</td>
<td>11.8/12.4/12.6/12.8</td>
<td colspan="2"><a href="https://pytorch.org/get-started/locally/">Refer to the PyTorch official website</a></td>
<td>None</td>
</tr>
<tr>
@@ -86,7 +85,7 @@ Also you can try `online demo <https://www.modelscope.cn/studios/OpenDataLab/Min
<td colspan="2">GPU VRAM 6GB or more</td>
<td colspan="2">All GPUs with Tensor Cores produced from Volta(2017) onwards.<br>
More than 6GB VRAM </td>
<td rowspan="2">apple slicon</td>
<td rowspan="2">Apple silicon</td>
</tr>
</table>
@@ -97,7 +96,7 @@ Create an environment
.. code-block:: shell
conda create -n mineru 'python>=3.10' -y
conda create -n mineru 'python=3.12' -y
conda activate mineru
pip install -U "magic-pdf[full]"

View File

@@ -117,8 +117,12 @@ def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table
return md_content, txt_content, archive_zip_path, new_pdf_path
latex_delimiters = [{'left': '$$', 'right': '$$', 'display': True},
{'left': '$', 'right': '$', 'display': False}]
latex_delimiters = [
{'left': '$$', 'right': '$$', 'display': True},
{'left': '$', 'right': '$', 'display': False},
{'left': '\\(', 'right': '\\)', 'display': False},
{'left': '\\[', 'right': '\\]', 'display': True},
]
def init_model():
@@ -218,7 +222,8 @@ if __name__ == '__main__':
with gr.Tabs():
with gr.Tab('Markdown rendering'):
md = gr.Markdown(label='Markdown rendering', height=1100, show_copy_button=True,
latex_delimiters=latex_delimiters, line_breaks=True)
latex_delimiters=latex_delimiters,
line_breaks=True)
with gr.Tab('Markdown text'):
md_text = gr.TextArea(lines=45, show_copy_button=True)
file.change(fn=to_pdf, inputs=file, outputs=pdf_show)

View File

@@ -4,9 +4,7 @@
## 环境配置
请使用以下命令配置所需的环境:
```bash
pip install -U litserve python-multipart filetype
pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118
pip install -U magic-pdf[full] litserve python-multipart filetype
```
## 快速使用

View File

@@ -21,6 +21,7 @@ from magic_pdf.libs.config_reader import get_bucket_name, get_s3_config
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.operators.models import InferenceResult
from magic_pdf.operators.pipes import PipeResult
from fastapi import Form
model_config.__use_inside_model__ = True
@@ -102,6 +103,7 @@ def init_writers(
# 处理上传的文件
file_bytes = file.file.read()
file_extension = os.path.splitext(file.filename)[1]
writer = FileBasedDataWriter(output_path)
image_writer = FileBasedDataWriter(output_image_path)
os.makedirs(output_image_path, exist_ok=True)
@@ -176,14 +178,14 @@ def encode_image(image_path: str) -> str:
)
async def file_parse(
file: UploadFile = None,
file_path: str = None,
parse_method: str = "auto",
is_json_md_dump: bool = False,
output_dir: str = "output",
return_layout: bool = False,
return_info: bool = False,
return_content_list: bool = False,
return_images: bool = False,
file_path: str = Form(None),
parse_method: str = Form("auto"),
is_json_md_dump: bool = Form(False),
output_dir: str = Form("output"),
return_layout: bool = Form(False),
return_info: bool = Form(False),
return_content_list: bool = Form(False),
return_images: bool = Form(False),
):
"""
Execute the process of converting PDF to JSON and MD, outputting MD and JSON files

View File

@@ -7,9 +7,9 @@ numpy>=1.21.6
pydantic>=2.7.2,<2.11
PyMuPDF>=1.24.9,<1.25.0
scikit-learn>=1.0.2
torch>=2.2.2,!=2.5.0,!=2.5.1
torch>=2.2.2,!=2.5.0,!=2.5.1,<3
torchvision
transformers>=4.49.0,!=4.51.0,<5.0.0
pdfminer.six==20231228
pdfminer.six==20250506
tqdm>=4.67.1
# The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.

View File

@@ -81,7 +81,7 @@ if __name__ == '__main__':
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
],
python_requires=">=3.10,<4", # 项目依赖的 Python 版本
python_requires=">=3.10,<3.14", # 项目依赖的 Python 版本
entry_points={
"console_scripts": [
"magic-pdf = magic_pdf.tools.cli:cli",

View File

@@ -247,6 +247,22 @@
"created_at": "2025-04-17T03:54:59Z",
"repoId": 765083837,
"pullRequestNo": 2267
},
{
"name": "kowyo",
"id": 110339237,
"comment_id": 2829263082,
"created_at": "2025-04-25T02:54:20Z",
"repoId": 765083837,
"pullRequestNo": 2367
},
{
"name": "CharlesKeeling65",
"id": 94165417,
"comment_id": 2841356871,
"created_at": "2025-04-30T09:25:31Z",
"repoId": 765083837,
"pullRequestNo": 2411
}
]
}