修复中文语境下长文本因分词导致文本丢失，以及 content 之间被额外添加空格的问题

2026-03-27 11:08:32 +07:00 · 2024-04-08 11:14:29 +08:00
parent 696906ed02
commit 05fe0548b1
1 changed files with 12 additions and 2 deletions
--- a/magic_pdf/dict2md/ocr_mkcontent.py
+++ b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -1,4 +1,5 @@
 from magic_pdf.libs.commons import s3_image_save_path, join_path
+from magic_pdf.libs.language import detect_lang
 from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
 from magic_pdf.libs.ocr_content_type import ContentType
 import wordninja
@@ -108,8 +109,14 @@ def ocr_mk_mm_markdown_with_para_core(paras_of_layout, mode):
                for span in line['spans']:
                    span_type = span.get('type')
                    content = ''
+                    language = ''
                    if span_type == ContentType.Text:
-                        content = ocr_escape_special_markdown_char(split_long_words(span['content']))
+                        content = span['content']
+                        language = detect_lang(content)
+                        if language == 'en':  # 只对英文长词进行分词处理，中文分词会丢失文本
+                            content = ocr_escape_special_markdown_char(split_long_words(content))
+                        else:
+                            content = ocr_escape_special_markdown_char(content)
                    elif span_type == ContentType.InlineEquation:
                        content = f"${ocr_escape_special_markdown_char(span['content'])}$"
                    elif span_type == ContentType.InterlineEquation:
@@ -120,7 +127,10 @@ def ocr_mk_mm_markdown_with_para_core(paras_of_layout, mode):
                        elif mode == 'nlp':
                            pass
                    if content != '':
-                        para_text += content + ' '
+                        if language == 'en':  # 英文语境下 content间需要空格分隔
+                            para_text += content + ' '
+                        else:  # 中文语境下，content间不需要空格分隔
+                            para_text += content
            if para_text.strip() == '':
                continue
            else: