From 05fe0548b1b5487347ee755f65ae098bb5378daa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B5=B5=E5=B0=8F=E8=92=99?= Date: Mon, 8 Apr 2024 11:14:29 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E4=B8=AD=E6=96=87=E8=AF=AD?= =?UTF-8?q?=E5=A2=83=E4=B8=8B=E9=95=BF=E6=96=87=E6=9C=AC=E5=9B=A0=E5=88=86?= =?UTF-8?q?=E8=AF=8D=E5=AF=BC=E8=87=B4=E6=96=87=E6=9C=AC=E4=B8=A2=E5=A4=B1?= =?UTF-8?q?=E5=92=8Ccontent=E9=97=B4=E8=A2=AB=E5=A2=9E=E5=8A=A0=E9=A2=9D?= =?UTF-8?q?=E5=A4=96=E7=A9=BA=E6=A0=BC=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- magic_pdf/dict2md/ocr_mkcontent.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/magic_pdf/dict2md/ocr_mkcontent.py b/magic_pdf/dict2md/ocr_mkcontent.py index 1838c0ca..d186f0e7 100644 --- a/magic_pdf/dict2md/ocr_mkcontent.py +++ b/magic_pdf/dict2md/ocr_mkcontent.py @@ -1,4 +1,5 @@ from magic_pdf.libs.commons import s3_image_save_path, join_path +from magic_pdf.libs.language import detect_lang from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char from magic_pdf.libs.ocr_content_type import ContentType import wordninja @@ -108,8 +109,14 @@ def ocr_mk_mm_markdown_with_para_core(paras_of_layout, mode): for span in line['spans']: span_type = span.get('type') content = '' + language = '' if span_type == ContentType.Text: - content = ocr_escape_special_markdown_char(split_long_words(span['content'])) + content = span['content'] + language = detect_lang(content) + if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本 + content = ocr_escape_special_markdown_char(split_long_words(content)) + else: + content = ocr_escape_special_markdown_char(content) elif span_type == ContentType.InlineEquation: content = f"${ocr_escape_special_markdown_char(span['content'])}$" elif span_type == ContentType.InterlineEquation: @@ -120,7 +127,10 @@ def ocr_mk_mm_markdown_with_para_core(paras_of_layout, mode): elif mode == 'nlp': pass if content != '': - para_text += content + ' ' + if language == 'en': # 英文语境下 content间需要空格分隔 + para_text += content + ' ' + else: # 中文语境下,content间不需要空格分隔 + para_text += content if para_text.strip() == '': continue else: