mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
refactor: add full-to-half character conversion and hyphen handling in merge_para_with_text function
This commit is contained in:
@@ -1,8 +1,10 @@
|
||||
import os
|
||||
import re
|
||||
|
||||
from loguru import logger
|
||||
from mineru.utils.config_reader import get_latex_delimiter_config, get_formula_enable, get_table_enable
|
||||
from mineru.utils.enum_class import MakeMode, BlockType, ContentType, ContentTypeV2
|
||||
from mineru.utils.language import detect_lang
|
||||
|
||||
latex_delimiters_config = get_latex_delimiter_config()
|
||||
|
||||
@@ -18,7 +20,49 @@ display_right_delimiter = delimiters['display']['right']
|
||||
inline_left_delimiter = delimiters['inline']['left']
|
||||
inline_right_delimiter = delimiters['inline']['right']
|
||||
|
||||
|
||||
def full_to_half(text: str) -> str:
|
||||
"""Convert full-width characters to half-width characters using code point manipulation.
|
||||
|
||||
Args:
|
||||
text: String containing full-width characters
|
||||
|
||||
Returns:
|
||||
String with full-width characters converted to half-width
|
||||
"""
|
||||
result = []
|
||||
for char in text:
|
||||
code = ord(char)
|
||||
# Full-width letters and numbers (FF21-FF3A for A-Z, FF41-FF5A for a-z, FF10-FF19 for 0-9)
|
||||
if (0xFF21 <= code <= 0xFF3A) or (0xFF41 <= code <= 0xFF5A) or (0xFF10 <= code <= 0xFF19):
|
||||
result.append(chr(code - 0xFEE0)) # Shift to ASCII range
|
||||
else:
|
||||
result.append(char)
|
||||
return ''.join(result)
|
||||
|
||||
|
||||
def __is_hyphen_at_line_end(line):
|
||||
"""Check if a line ends with one or more letters followed by a hyphen.
|
||||
|
||||
Args:
|
||||
line (str): The line of text to check.
|
||||
|
||||
Returns:
|
||||
bool: True if the line ends with one or more letters followed by a hyphen, False otherwise.
|
||||
"""
|
||||
# Use regex to check if the line ends with one or more letters followed by a hyphen
|
||||
return bool(re.search(r'[A-Za-z]+-\s*$', line))
|
||||
|
||||
|
||||
def merge_para_with_text(para_block, formula_enable=True, img_buket_path=''):
|
||||
block_text = ''
|
||||
for line in para_block['lines']:
|
||||
for span in line['spans']:
|
||||
if span['type'] in [ContentType.TEXT]:
|
||||
span['content'] = full_to_half(span['content'])
|
||||
block_text += span['content']
|
||||
block_lang = detect_lang(block_text)
|
||||
|
||||
para_text = ''
|
||||
for line in para_block['lines']:
|
||||
for j, span in enumerate(line['spans']):
|
||||
@@ -34,17 +78,37 @@ def merge_para_with_text(para_block, formula_enable=True, img_buket_path=''):
|
||||
else:
|
||||
if span.get('image_path', ''):
|
||||
content = f""
|
||||
# content = content.strip()
|
||||
# if content:
|
||||
# if span_type in [ContentType.TEXT, ContentType.INLINE_EQUATION]:
|
||||
# if j == len(line['spans']) - 1:
|
||||
# para_text += content
|
||||
# else:
|
||||
# para_text += f'{content} '
|
||||
# elif span_type == ContentType.INTERLINE_EQUATION:
|
||||
# para_text += content
|
||||
content = content.strip()
|
||||
|
||||
if content:
|
||||
if span_type in [ContentType.TEXT, ContentType.INLINE_EQUATION]:
|
||||
if j == len(line['spans']) - 1:
|
||||
langs = ['zh', 'ja', 'ko']
|
||||
# logger.info(f'block_lang: {block_lang}, content: {content}')
|
||||
if block_lang in langs: # 中文/日语/韩文语境下,换行不需要空格分隔,但是如果是行内公式结尾,还是要加空格
|
||||
if j == len(line['spans']) - 1 and span_type not in [ContentType.INLINE_EQUATION]:
|
||||
para_text += content
|
||||
else:
|
||||
para_text += f'{content} '
|
||||
elif span_type == ContentType.INTERLINE_EQUATION:
|
||||
para_text += content
|
||||
else:
|
||||
if span_type in [ContentType.TEXT, ContentType.INLINE_EQUATION]:
|
||||
# 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
|
||||
if j == len(line['spans']) - 1 and span_type == ContentType.TEXT and __is_hyphen_at_line_end(
|
||||
content):
|
||||
para_text += content[:-1]
|
||||
else: # 西方文本语境下 content间需要空格分隔
|
||||
para_text += f'{content} '
|
||||
elif span_type == ContentType.INTERLINE_EQUATION:
|
||||
para_text += content
|
||||
return para_text
|
||||
|
||||
|
||||
def mk_blocks_to_markdown(para_blocks, make_mode, formula_enable, table_enable, img_buket_path=''):
|
||||
page_markdown = []
|
||||
for para_block in para_blocks:
|
||||
|
||||
Reference in New Issue
Block a user