mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
refactor(ocr): increase the dilation factor in OCR to fix word concatenation
- Remove unused functions such as split_long_words, ocr_mk_mm_markdown_with_para, etc.
- Simplify ocr_mk_markdown_with_para_core_v2 by removing unnecessary language detection and word-splitting logic.
- Remove the wordninja dependency from the requirements files.
- Update ocr_model_init to accept additional parameters for OCR model configuration.
This commit is contained in:
@@ -1,6 +1,5 @@
|
||||
import re
|
||||
|
||||
import wordninja
|
||||
from loguru import logger
|
||||
|
||||
from magic_pdf.libs.commons import join_path
|
||||
@@ -25,37 +24,6 @@ def __is_hyphen_at_line_end(line):
|
||||
return bool(re.search(r'[A-Za-z]+-\s*$', line))
|
||||
|
||||
|
||||
def split_long_words(text):
    """Break over-long word tokens in *text* into shorter words.

    The text is cut on single spaces. Each piece is tokenised into runs of
    word characters and individual non-word characters; any token longer
    than 10 characters is passed to ``wordninja.split`` and its parts are
    re-joined with spaces. Tokens are then glued back together unchanged
    and the pieces are re-joined with single spaces.
    """

    def _resegment(segment):
        # Tokens are word runs or single non-word characters, so joining
        # them with '' reproduces the segment except where a long token
        # was split.
        tokens = re.findall(r'\w+|[^\w]', segment, re.UNICODE)
        return ''.join(
            ' '.join(wordninja.split(token)) if len(token) > 10 else token
            for token in tokens
        )

    return ' '.join(_resegment(segment) for segment in text.split(' '))
|
||||
|
||||
|
||||
def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
    """Build one markdown string for all pages in 'mm' mode.

    For every page, the value under 'para_blocks' is rendered by
    ``ocr_mk_markdown_with_para_core_v2`` with mode 'mm' and the given image
    bucket path; the rendered pieces of all pages are joined with a blank
    line between them.
    """
    rendered = []
    for page in pdf_info_list:
        rendered += ocr_mk_markdown_with_para_core_v2(
            page.get('para_blocks'), 'mm', img_buket_path)
    return '\n\n'.join(rendered)
|
||||
|
||||
|
||||
def ocr_mk_nlp_markdown_with_para(pdf_info_dict: list):
    """Build one markdown string for all pages in 'nlp' mode.

    For every page, the value under 'para_blocks' is rendered by
    ``ocr_mk_markdown_with_para_core_v2`` with mode 'nlp'; the rendered
    pieces of all pages are joined with a blank line between them.
    """
    rendered = []
    for page in pdf_info_dict:
        rendered += ocr_mk_markdown_with_para_core_v2(
            page.get('para_blocks'), 'nlp')
    return '\n\n'.join(rendered)
|
||||
|
||||
|
||||
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
|
||||
img_buket_path):
|
||||
markdown_with_para_and_pagination = []
|
||||
@@ -76,45 +44,6 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
|
||||
return markdown_with_para_and_pagination
|
||||
|
||||
|
||||
def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=''):
|
||||
page_markdown = []
|
||||
for paras in paras_of_layout:
|
||||
for para in paras:
|
||||
para_text = ''
|
||||
for line in para:
|
||||
for span in line['spans']:
|
||||
span_type = span.get('type')
|
||||
content = ''
|
||||
language = ''
|
||||
if span_type == ContentType.Text:
|
||||
content = span['content']
|
||||
language = detect_lang(content)
|
||||
if (language == 'en'): # 只对英文长词进行分词处理,中文分词会丢失文本
|
||||
content = ocr_escape_special_markdown_char(
|
||||
split_long_words(content))
|
||||
else:
|
||||
content = ocr_escape_special_markdown_char(content)
|
||||
elif span_type == ContentType.InlineEquation:
|
||||
content = f"${span['content']}$"
|
||||
elif span_type == ContentType.InterlineEquation:
|
||||
content = f"\n$$\n{span['content']}\n$$\n"
|
||||
elif span_type in [ContentType.Image, ContentType.Table]:
|
||||
if mode == 'mm':
|
||||
content = f"\n})\n"
|
||||
elif mode == 'nlp':
|
||||
pass
|
||||
if content != '':
|
||||
if language == 'en': # 英文语境下 content间需要空格分隔
|
||||
para_text += content + ' '
|
||||
else: # 中文语境下,content间不需要空格分隔
|
||||
para_text += content
|
||||
if para_text.strip() == '':
|
||||
continue
|
||||
else:
|
||||
page_markdown.append(para_text.strip() + ' ')
|
||||
return page_markdown
|
||||
|
||||
|
||||
def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
|
||||
mode,
|
||||
img_buket_path='',
|
||||
@@ -207,21 +136,11 @@ def merge_para_with_text(para_block, parse_type="auto", lang=None):
|
||||
if line_text != '':
|
||||
line_lang = detect_lang(line_text)
|
||||
for span in line['spans']:
|
||||
|
||||
span_type = span['type']
|
||||
content = ''
|
||||
if span_type == ContentType.Text:
|
||||
content = span['content']
|
||||
# language = detect_lang(content)
|
||||
language = detect_language(content)
|
||||
# 判断是否小语种
|
||||
if lang is not None and lang != 'en':
|
||||
content = ocr_escape_special_markdown_char(content)
|
||||
else: # 非小语种逻辑
|
||||
if language == 'en' and parse_type == 'ocr': # 只对英文长词进行分词处理,中文分词会丢失文本
|
||||
content = ocr_escape_special_markdown_char(
|
||||
split_long_words(content))
|
||||
else:
|
||||
content = ocr_escape_special_markdown_char(content)
|
||||
content = ocr_escape_special_markdown_char(span['content'])
|
||||
elif span_type == ContentType.InlineEquation:
|
||||
content = f" ${span['content']}$ "
|
||||
elif span_type == ContentType.InterlineEquation:
|
||||
@@ -242,41 +161,6 @@ def merge_para_with_text(para_block, parse_type="auto", lang=None):
|
||||
return para_text
|
||||
|
||||
|
||||
def para_to_standard_format(para, img_buket_path):
    """Convert a paragraph (a sequence of lines) into a content dict.

    * Empty paragraph: returns ``{}``.
    * Exactly one line: delegates to ``line_to_standard_format``.
    * Several lines: concatenates the text and inline-equation spans of all
      lines into a single 'text' entry and counts the inline equations.
      Other span types contribute nothing.
    """
    line_count = len(para)
    if line_count == 0:
        return {}
    if line_count == 1:
        return line_to_standard_format(para[0], img_buket_path)

    pieces = []
    equation_count = 0
    for line in para:
        for span in line['spans']:
            kind = span.get('type')
            lang = ''
            text = ''
            if kind == ContentType.Text:
                raw = span['content']
                lang = detect_lang(raw)
                # Only long English words are re-segmented; doing the same
                # to Chinese would lose text.
                text = ocr_escape_special_markdown_char(
                    split_long_words(raw) if lang == 'en' else raw)
            elif kind == ContentType.InlineEquation:
                text = f"${span['content']}$"
                equation_count += 1
            # English spans are separated by a space; everything else is
            # concatenated directly.
            pieces.append(text + ' ' if lang == 'en' else text)

    return {
        'type': 'text',
        'text': ''.join(pieces),
        'inline_equation_num': equation_count,
    }
|
||||
|
||||
|
||||
def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type="auto", lang=None, drop_reason=None):
|
||||
para_type = para_block['type']
|
||||
para_content = {}
|
||||
@@ -330,82 +214,6 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type=
|
||||
return para_content
|
||||
|
||||
|
||||
def make_standard_format_with_para(pdf_info_dict: list, img_buket_path: str):
    """Collect the standard-format content of every paragraph block.

    Pages whose 'para_blocks' entry is missing or empty are skipped. Each
    remaining block is converted with ``para_to_standard_format_v2``.

    Args:
        pdf_info_dict: list of per-page info dicts.
        img_buket_path: image path prefix forwarded to the converter.

    Returns:
        A flat list of the converted blocks, in page order.
    """
    content_list = []
    for position, page_info in enumerate(pdf_info_dict):
        para_blocks = page_info.get('para_blocks')
        if not para_blocks:
            continue
        # para_to_standard_format_v2 requires a page index; omitting it
        # raised TypeError on the first non-empty page.
        # NOTE(review): assumes pages may carry a 'page_idx' key; falls back
        # to the page's position in the list -- confirm against the caller.
        page_idx = page_info.get('page_idx', position)
        content_list.extend(
            para_to_standard_format_v2(para_block, img_buket_path, page_idx)
            for para_block in para_blocks
        )
    return content_list
|
||||
|
||||
|
||||
def line_to_standard_format(line, img_buket_path):
    """Convert a single line (a dict with a 'spans' list) into a content dict.

    The spans are scanned in order:

    * a span without content but with an 'image_path' of type image/table
      ends the scan and yields an 'image' / 'table' dict whose 'img_path' is
      ``img_buket_path`` joined with the span's path;
    * an interline-equation span with content ends the scan and yields an
      'equation' dict;
    * inline equations and (markdown-escaped) text are accumulated.

    If no span ends the scan early, a 'text' dict with the accumulated text
    and the number of inline equations is returned.
    """
    text_parts = []
    equation_count = 0
    for span in line['spans']:
        if span.get('content'):
            if span['type'] == ContentType.InterlineEquation:
                latex_body = span['content']
                return {
                    'type': 'equation',
                    'latex': f'$$\n{latex_body}\n$$'
                }
            if span['type'] == ContentType.InlineEquation:
                text_parts.append(f"${span['content']}$")
                equation_count += 1
            elif span['type'] == ContentType.Text:
                # Escape markdown special characters in plain text.
                text_parts.append(
                    ocr_escape_special_markdown_char(span['content']))
            continue

        if not span.get('image_path'):
            continue

        if span['type'] == ContentType.Image:
            label = 'image'
        elif span['type'] == ContentType.Table:
            label = 'table'
        else:
            # A path on any other span type is ignored.
            continue
        return {
            'type': label,
            'img_path': join_path(img_buket_path, span['image_path']),
        }

    return {
        'type': 'text',
        'text': ''.join(text_parts),
        'inline_equation_num': equation_count,
    }
|
||||
|
||||
|
||||
def ocr_mk_mm_standard_format(pdf_info_dict: list, img_buket_path=''):
    """Convert every line of every preprocessed block into a content dict.

    Fields that can appear in the entries of the returned list:

    * type: string, one of image/text/table/equation (interline equations
      are emitted on their own, inline ones are merged into the text)
    * latex: string, the LaTeX text
    * text: string, plain-text data
    * md: string, markdown-formatted text
    * img_path: string, e.g. s3://full/path/to/img.jpg

    Args:
        pdf_info_dict: list of per-page info dicts; pages whose
            'preproc_blocks' entry is missing or empty are skipped.
        img_buket_path: prefix forwarded to ``line_to_standard_format`` for
            image/table paths. Optional, so existing one-argument callers
            keep working.

    Returns:
        A flat list with one content dict per line, in page/block/line order.
    """
    content_list = []
    for page_info in pdf_info_dict:
        blocks = page_info.get('preproc_blocks')
        if not blocks:
            continue
        for block in blocks:
            # line_to_standard_format takes the image bucket path as a
            # required second argument; calling it with the line alone
            # raised TypeError.
            content_list.extend(
                line_to_standard_format(line, img_buket_path)
                for line in block['lines']
            )
    return content_list
|
||||
|
||||
|
||||
def union_make(pdf_info_dict: list,
|
||||
make_mode: str,
|
||||
drop_mode: str,
|
||||
|
||||
@@ -77,11 +77,11 @@ def layout_model_init(weight, config_file, device):
|
||||
return model
|
||||
|
||||
|
||||
def ocr_model_init(show_log: bool = False, det_db_box_thresh=0.3, lang=None):
|
||||
def ocr_model_init(show_log: bool = False, det_db_box_thresh=0.3, lang=None, use_dilation=True, det_db_unclip_ratio=2.4):
|
||||
if lang is not None:
|
||||
model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh, lang=lang)
|
||||
model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh, lang=lang, use_dilation=use_dilation, det_db_unclip_ratio=det_db_unclip_ratio)
|
||||
else:
|
||||
model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh)
|
||||
model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh, use_dilation=use_dilation, det_db_unclip_ratio=det_db_unclip_ratio)
|
||||
return model
|
||||
|
||||
|
||||
|
||||
@@ -5,7 +5,6 @@ PyMuPDF>=1.24.9
|
||||
loguru>=0.6.0
|
||||
numpy>=1.21.6,<2.0.0
|
||||
fast-langdetect==0.2.0
|
||||
wordninja>=2.0.0
|
||||
scikit-learn>=1.0.2
|
||||
pdfminer.six==20231228
|
||||
unimernet==0.2.1
|
||||
|
||||
@@ -8,7 +8,6 @@ pdfminer.six==20231228
|
||||
pydantic>=2.7.2,<2.8.0
|
||||
PyMuPDF>=1.24.9
|
||||
scikit-learn>=1.0.2
|
||||
wordninja>=2.0.0
|
||||
torch>=2.2.2,<=2.3.1
|
||||
transformers
|
||||
# The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.
|
||||
|
||||
Reference in New Issue
Block a user