修复中文语境下的两个问题:长文本因分词处理导致文本丢失,以及 content 之间被插入多余空格

This commit is contained in:
赵小蒙
2024-04-08 11:14:29 +08:00
parent 696906ed02
commit 05fe0548b1

View File

@@ -1,4 +1,5 @@
from magic_pdf.libs.commons import s3_image_save_path, join_path
from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
from magic_pdf.libs.ocr_content_type import ContentType
import wordninja
@@ -108,8 +109,14 @@ def ocr_mk_mm_markdown_with_para_core(paras_of_layout, mode):
for span in line['spans']:
span_type = span.get('type')
content = ''
language = ''
if span_type == ContentType.Text:
content = ocr_escape_special_markdown_char(split_long_words(span['content']))
content = span['content']
language = detect_lang(content)
if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本
content = ocr_escape_special_markdown_char(split_long_words(content))
else:
content = ocr_escape_special_markdown_char(content)
elif span_type == ContentType.InlineEquation:
content = f"${ocr_escape_special_markdown_char(span['content'])}$"
elif span_type == ContentType.InterlineEquation:
@@ -120,7 +127,10 @@ def ocr_mk_mm_markdown_with_para_core(paras_of_layout, mode):
elif mode == 'nlp':
pass
if content != '':
para_text += content + ' '
if language == 'en': # 英文语境下 content间需要空格分隔
para_text += content + ' '
else: # 中文语境下content间不需要空格分隔
para_text += content
if para_text.strip() == '':
continue
else: