refactor: add full-to-half character conversion and hyphen handling in merge_para_with_text function

2026-03-27 11:08:32 +07:00 · 2025-12-18 11:54:51 +08:00
parent 1ab0c0f68c
commit 6fca3d3df1
1 changed files with 69 additions and 5 deletions
--- a/mineru/backend/vlm/vlm_middle_json_mkcontent.py
+++ b/mineru/backend/vlm/vlm_middle_json_mkcontent.py
@@ -1,8 +1,10 @@
 import os
+import re

 from loguru import logger
 from mineru.utils.config_reader import get_latex_delimiter_config, get_formula_enable, get_table_enable
 from mineru.utils.enum_class import MakeMode, BlockType, ContentType, ContentTypeV2
+from mineru.utils.language import detect_lang

 latex_delimiters_config = get_latex_delimiter_config()

@@ -18,7 +20,49 @@ display_right_delimiter = delimiters['display']['right']
 inline_left_delimiter = delimiters['inline']['left']
 inline_right_delimiter = delimiters['inline']['right']

+
+def full_to_half(text: str) -> str:
+    """Convert full-width characters to half-width characters using code point manipulation.
+
+    Args:
+        text: String containing full-width characters
+
+    Returns:
+        String with full-width characters converted to half-width
+    """
+    result = []
+    for char in text:
+        code = ord(char)
+        # Full-width letters and numbers (FF21-FF3A for A-Z, FF41-FF5A for a-z, FF10-FF19 for 0-9)
+        if (0xFF21 <= code <= 0xFF3A) or (0xFF41 <= code <= 0xFF5A) or (0xFF10 <= code <= 0xFF19):
+            result.append(chr(code - 0xFEE0))  # Shift to ASCII range
+        else:
+            result.append(char)
+    return ''.join(result)
+
+
+def __is_hyphen_at_line_end(line):
+    """Check if a line ends with one or more letters followed by a hyphen.
+
+    Args:
+    line (str): The line of text to check.
+
+    Returns:
+    bool: True if the line ends with one or more letters followed by a hyphen, False otherwise.
+    """
+    # Use regex to check if the line ends with one or more letters followed by a hyphen
+    return bool(re.search(r'[A-Za-z]+-\s*$', line))
+
+
 def merge_para_with_text(para_block, formula_enable=True, img_buket_path=''):
+    block_text = ''
+    for line in para_block['lines']:
+        for span in line['spans']:
+            if span['type'] in [ContentType.TEXT]:
+                span['content'] = full_to_half(span['content'])
+                block_text += span['content']
+    block_lang = detect_lang(block_text)
+
    para_text = ''
    for line in para_block['lines']:
        for j, span in enumerate(line['spans']):
@@ -34,17 +78,37 @@ def merge_para_with_text(para_block, formula_enable=True, img_buket_path=''):
                else:
                    if span.get('image_path', ''):
                        content = f"![]({img_buket_path}/{span['image_path']})"
-            # content = content.strip()
+            # if content:
+            #     if span_type in [ContentType.TEXT, ContentType.INLINE_EQUATION]:
+            #         if j == len(line['spans']) - 1:
+            #             para_text += content
+            #         else:
+            #             para_text += f'{content} '
+            #     elif span_type == ContentType.INTERLINE_EQUATION:
+            #         para_text += content
+            content = content.strip()
+
            if content:
-                if span_type in [ContentType.TEXT, ContentType.INLINE_EQUATION]:
-                    if j == len(line['spans']) - 1:
+                langs = ['zh', 'ja', 'ko']
+                # logger.info(f'block_lang: {block_lang}, content: {content}')
+                if block_lang in langs:  # 中文/日语/韩文语境下，换行不需要空格分隔,但是如果是行内公式结尾，还是要加空格
+                    if j == len(line['spans']) - 1 and span_type not in [ContentType.INLINE_EQUATION]:
                        para_text += content
                    else:
                        para_text += f'{content} '
-                elif span_type == ContentType.INTERLINE_EQUATION:
-                    para_text += content
+                else:
+                    if span_type in [ContentType.TEXT, ContentType.INLINE_EQUATION]:
+                        # 如果span是line的最后一个且末尾带有-连字符，那么末尾不应该加空格,同时应该把-删除
+                        if j == len(line['spans']) - 1 and span_type == ContentType.TEXT and __is_hyphen_at_line_end(
+                                content):
+                            para_text += content[:-1]
+                        else:  # 西方文本语境下 content间需要空格分隔
+                            para_text += f'{content} '
+                    elif span_type == ContentType.INTERLINE_EQUATION:
+                        para_text += content
    return para_text

+
 def mk_blocks_to_markdown(para_blocks, make_mode, formula_enable, table_enable, img_buket_path=''):
    page_markdown = []
    for para_block in para_blocks: