mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 02:58:54 +07:00
refactor: expand OCR text conditions for category assignment in analysis scripts
This commit is contained in:
@@ -290,7 +290,8 @@ def _process_ocr_and_formulas(
|
||||
if (
|
||||
ocr_text in [
|
||||
'(204号', '(20', '(2', '(2号', '(20号', '号','(204',
|
||||
'(cid:)',
|
||||
'(cid:)', '(ci:)', '(cd:1)', 'cd:)', 'c)', '(cd:)', 'c', 'id:)',
|
||||
':)', '√:)', '√i:)', '−i:)', '−:' , 'i:)',
|
||||
]
|
||||
and ocr_score < 0.8
|
||||
and layout_res_width < layout_res_height
|
||||
|
||||
@@ -420,7 +420,15 @@ class BatchAnalyze:
|
||||
layout_res_item['poly'][4], layout_res_item['poly'][5]]
|
||||
layout_res_width = layout_res_bbox[2] - layout_res_bbox[0]
|
||||
layout_res_height = layout_res_bbox[3] - layout_res_bbox[1]
|
||||
if ocr_text in ['(204号', '(20', '(2', '(2号', '(20号', '号', '(204'] and ocr_score < 0.8 and layout_res_width < layout_res_height:
|
||||
if (
|
||||
ocr_text in [
|
||||
'(204号', '(20', '(2', '(2号', '(20号', '号', '(204',
|
||||
'(cid:)', '(ci:)', '(cd:1)', 'cd:)', 'c)', '(cd:)', 'c', 'id:)',
|
||||
':)', '√:)', '√i:)', '−i:)', '−:', 'i:)',
|
||||
]
|
||||
and ocr_score < 0.8
|
||||
and layout_res_width < layout_res_height
|
||||
):
|
||||
layout_res_item['category_id'] = 16
|
||||
|
||||
total_processed += len(img_crop_list)
|
||||
|
||||
Reference in New Issue
Block a user