refactor: add full-to-half character conversion and hyphen handling in merge_para_with_text function

This commit is contained in:
myhloli
2025-12-18 11:54:51 +08:00
parent 1ab0c0f68c
commit 6fca3d3df1

View File

@@ -1,8 +1,10 @@
import os
import re
from loguru import logger
from mineru.utils.config_reader import get_latex_delimiter_config, get_formula_enable, get_table_enable
from mineru.utils.enum_class import MakeMode, BlockType, ContentType, ContentTypeV2
from mineru.utils.language import detect_lang
latex_delimiters_config = get_latex_delimiter_config()
@@ -18,7 +20,49 @@ display_right_delimiter = delimiters['display']['right']
inline_left_delimiter = delimiters['inline']['left']
inline_right_delimiter = delimiters['inline']['right']
def full_to_half(text: str) -> str:
"""Convert full-width characters to half-width characters using code point manipulation.
Args:
text: String containing full-width characters
Returns:
String with full-width characters converted to half-width
"""
result = []
for char in text:
code = ord(char)
# Full-width letters and numbers (FF21-FF3A for A-Z, FF41-FF5A for a-z, FF10-FF19 for 0-9)
if (0xFF21 <= code <= 0xFF3A) or (0xFF41 <= code <= 0xFF5A) or (0xFF10 <= code <= 0xFF19):
result.append(chr(code - 0xFEE0)) # Shift to ASCII range
else:
result.append(char)
return ''.join(result)
def __is_hyphen_at_line_end(line):
"""Check if a line ends with one or more letters followed by a hyphen.
Args:
line (str): The line of text to check.
Returns:
bool: True if the line ends with one or more letters followed by a hyphen, False otherwise.
"""
# Use regex to check if the line ends with one or more letters followed by a hyphen
return bool(re.search(r'[A-Za-z]+-\s*$', line))
def merge_para_with_text(para_block, formula_enable=True, img_buket_path=''):
block_text = ''
for line in para_block['lines']:
for span in line['spans']:
if span['type'] in [ContentType.TEXT]:
span['content'] = full_to_half(span['content'])
block_text += span['content']
block_lang = detect_lang(block_text)
para_text = ''
for line in para_block['lines']:
for j, span in enumerate(line['spans']):
@@ -34,17 +78,37 @@ def merge_para_with_text(para_block, formula_enable=True, img_buket_path=''):
else:
if span.get('image_path', ''):
content = f"![]({img_buket_path}/{span['image_path']})"
# content = content.strip()
# if content:
# if span_type in [ContentType.TEXT, ContentType.INLINE_EQUATION]:
# if j == len(line['spans']) - 1:
# para_text += content
# else:
# para_text += f'{content} '
# elif span_type == ContentType.INTERLINE_EQUATION:
# para_text += content
content = content.strip()
if content:
if span_type in [ContentType.TEXT, ContentType.INLINE_EQUATION]:
if j == len(line['spans']) - 1:
langs = ['zh', 'ja', 'ko']
# logger.info(f'block_lang: {block_lang}, content: {content}')
if block_lang in langs: # 中文/日语/韩文语境下,换行不需要空格分隔,但是如果是行内公式结尾,还是要加空格
if j == len(line['spans']) - 1 and span_type not in [ContentType.INLINE_EQUATION]:
para_text += content
else:
para_text += f'{content} '
elif span_type == ContentType.INTERLINE_EQUATION:
para_text += content
else:
if span_type in [ContentType.TEXT, ContentType.INLINE_EQUATION]:
# 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
if j == len(line['spans']) - 1 and span_type == ContentType.TEXT and __is_hyphen_at_line_end(
content):
para_text += content[:-1]
else: # 西方文本语境下 content间需要空格分隔
para_text += f'{content} '
elif span_type == ContentType.INTERLINE_EQUATION:
para_text += content
return para_text
def mk_blocks_to_markdown(para_blocks, make_mode, formula_enable, table_enable, img_buket_path=''):
page_markdown = []
for para_block in para_blocks: