mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
修复中文语境下长文本因分词导致文本丢失和content间被增加额外空格的问题
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
from magic_pdf.libs.commons import s3_image_save_path, join_path
|
||||
from magic_pdf.libs.language import detect_lang
|
||||
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
|
||||
from magic_pdf.libs.ocr_content_type import ContentType
|
||||
import wordninja
|
||||
@@ -108,8 +109,14 @@ def ocr_mk_mm_markdown_with_para_core(paras_of_layout, mode):
|
||||
for span in line['spans']:
|
||||
span_type = span.get('type')
|
||||
content = ''
|
||||
language = ''
|
||||
if span_type == ContentType.Text:
|
||||
- content = ocr_escape_special_markdown_char(split_long_words(span['content']))
|
||||
content = span['content']
|
||||
language = detect_lang(content)
|
||||
if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本
|
||||
content = ocr_escape_special_markdown_char(split_long_words(content))
|
||||
else:
|
||||
content = ocr_escape_special_markdown_char(content)
|
||||
elif span_type == ContentType.InlineEquation:
|
||||
content = f"${ocr_escape_special_markdown_char(span['content'])}$"
|
||||
elif span_type == ContentType.InterlineEquation:
|
||||
@@ -120,7 +127,10 @@ def ocr_mk_mm_markdown_with_para_core(paras_of_layout, mode):
|
||||
elif mode == 'nlp':
|
||||
pass
|
||||
if content != '':
|
||||
- para_text += content + ' '
|
||||
if language == 'en': # 英文语境下 content间需要空格分隔
|
||||
para_text += content + ' '
|
||||
else: # 中文语境下,content间不需要空格分隔
|
||||
para_text += content
|
||||
if para_text.strip() == '':
|
||||
continue
|
||||
else:
|
||||
|
||||
Reference in New Issue
Block a user