mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
Compare commits
7 Commits
magic_pdf-
...
magic_pdf-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
55fcb7387f | ||
|
|
f2169686e1 | ||
|
|
9c4e779b91 | ||
|
|
8d9070db10 | ||
|
|
69cdea908d | ||
|
|
1d1c7ba9ab | ||
|
|
4d5fd0ee55 |
@@ -48,6 +48,9 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
|
||||
</div>
|
||||
|
||||
# Changelog
|
||||
- 2025/04/22 1.3.7 Released
|
||||
- Fixed the issue where the `lang` parameter was ineffective during table parsing model initialization.
|
||||
- Fixed the significant slowdown in OCR and table parsing speed in `cpu` mode.
|
||||
- 2025/04/16 1.3.4 Released
|
||||
- Slightly improved the speed of OCR detection by removing some unused blocks.
|
||||
- Fixed page-level sorting errors caused by footnotes in certain cases.
|
||||
|
||||
@@ -47,6 +47,9 @@
|
||||
</div>
|
||||
|
||||
# 更新记录
|
||||
- 2025/04/22 1.3.7 发布
|
||||
- 修复表格解析模型初始化时`lang`参数失效的问题
|
||||
- 修复在`cpu`模式下ocr和表格解析速度大幅下降的问题
|
||||
- 2025/04/16 1.3.4 发布
|
||||
- 通过移除一些无用的块,小幅提升了ocr-det的速度
|
||||
- 修复部分情况下由footnote导致的页面内排序错误
|
||||
|
||||
@@ -1 +1 @@
|
||||
__version__ = "1.3.5"
|
||||
__version__ = "1.3.6"
|
||||
|
||||
@@ -161,20 +161,13 @@ class BatchAnalyze:
|
||||
for table_res_dict in tqdm(table_res_list_all_page, desc="Table Predict"):
|
||||
_lang = table_res_dict['lang']
|
||||
atom_model_manager = AtomModelSingleton()
|
||||
ocr_engine = atom_model_manager.get_atom_model(
|
||||
atom_model_name='ocr',
|
||||
ocr_show_log=False,
|
||||
det_db_box_thresh=0.5,
|
||||
det_db_unclip_ratio=1.6,
|
||||
lang=_lang
|
||||
)
|
||||
table_model = atom_model_manager.get_atom_model(
|
||||
atom_model_name='table',
|
||||
table_model_name='rapid_table',
|
||||
table_model_path='',
|
||||
table_max_time=400,
|
||||
device='cpu',
|
||||
ocr_engine=ocr_engine,
|
||||
lang=_lang,
|
||||
table_sub_model_name='slanet_plus'
|
||||
)
|
||||
html_code, table_cell_bboxes, logic_points, elapse = table_model.predict(table_res_dict['table_img'])
|
||||
|
||||
@@ -53,6 +53,11 @@ class PytorchPaddleOCR(TextSystem):
|
||||
args = parser.parse_args(args)
|
||||
|
||||
self.lang = kwargs.get('lang', 'ch')
|
||||
|
||||
device = get_device()
|
||||
if device == 'cpu' and self.lang == 'ch':
|
||||
self.lang = 'ch_lite'
|
||||
|
||||
if self.lang in latin_lang:
|
||||
self.lang = 'latin'
|
||||
elif self.lang in arabic_lang:
|
||||
|
||||
Reference in New Issue
Block a user