mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
refactor: expand OCR text conditions for category assignment in analysis scripts
This commit is contained in:
@@ -290,7 +290,8 @@ def _process_ocr_and_formulas(
|
|||||||
if (
|
if (
|
||||||
ocr_text in [
|
ocr_text in [
|
||||||
'(204号', '(20', '(2', '(2号', '(20号', '号','(204',
|
'(204号', '(20', '(2', '(2号', '(20号', '号','(204',
|
||||||
'(cid:)',
|
'(cid:)', '(ci:)', '(cd:1)', 'cd:)', 'c)', '(cd:)', 'c', 'id:)',
|
||||||
|
':)', '√:)', '√i:)', '−i:)', '−:' , 'i:)',
|
||||||
]
|
]
|
||||||
and ocr_score < 0.8
|
and ocr_score < 0.8
|
||||||
and layout_res_width < layout_res_height
|
and layout_res_width < layout_res_height
|
||||||
|
|||||||
@@ -420,7 +420,15 @@ class BatchAnalyze:
|
|||||||
layout_res_item['poly'][4], layout_res_item['poly'][5]]
|
layout_res_item['poly'][4], layout_res_item['poly'][5]]
|
||||||
layout_res_width = layout_res_bbox[2] - layout_res_bbox[0]
|
layout_res_width = layout_res_bbox[2] - layout_res_bbox[0]
|
||||||
layout_res_height = layout_res_bbox[3] - layout_res_bbox[1]
|
layout_res_height = layout_res_bbox[3] - layout_res_bbox[1]
|
||||||
if ocr_text in ['(204号', '(20', '(2', '(2号', '(20号', '号', '(204'] and ocr_score < 0.8 and layout_res_width < layout_res_height:
|
if (
|
||||||
|
ocr_text in [
|
||||||
|
'(204号', '(20', '(2', '(2号', '(20号', '号', '(204',
|
||||||
|
'(cid:)', '(ci:)', '(cd:1)', 'cd:)', 'c)', '(cd:)', 'c', 'id:)',
|
||||||
|
':)', '√:)', '√i:)', '−i:)', '−:', 'i:)',
|
||||||
|
]
|
||||||
|
and ocr_score < 0.8
|
||||||
|
and layout_res_width < layout_res_height
|
||||||
|
):
|
||||||
layout_res_item['category_id'] = 16
|
layout_res_item['category_id'] = 16
|
||||||
|
|
||||||
total_processed += len(img_crop_list)
|
total_processed += len(img_crop_list)
|
||||||
|
|||||||
Reference in New Issue
Block a user