mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
fix(merge_text): add ligature replacement functionality
- Implement the __replace_ligatures function to split ligature characters into their component letters
- Integrate ligature replacement into the merge_para_with_text function
- Handle the common ligatures fi, fl, ff, ffi, and ffl
This commit is contained in:
@@ -119,6 +119,16 @@ def detect_language(text):
|
||||
return 'empty'
|
||||
|
||||
|
||||
# Ligature splitting: maps each single-codepoint typographic ligature to the
# plain ASCII letters it represents. The keys are written as explicit \uXXXX
# escapes so that editors or text pipelines applying Unicode compatibility
# normalisation cannot silently turn them into ordinary letters (which would
# make the replacement a no-op).
_LIGATURE_TABLE = str.maketrans({
    '\ufb00': 'ff',   # LATIN SMALL LIGATURE FF
    '\ufb01': 'fi',   # LATIN SMALL LIGATURE FI
    '\ufb02': 'fl',   # LATIN SMALL LIGATURE FL
    '\ufb03': 'ffi',  # LATIN SMALL LIGATURE FFI
    '\ufb04': 'ffl',  # LATIN SMALL LIGATURE FFL
})


def __replace_ligatures(text: str) -> str:
    """Expand ff/fi/fl/ffi/ffl ligature characters into separate letters.

    Args:
        text: The string to clean up.

    Returns:
        A copy of ``text`` in which every occurrence of U+FB00..U+FB04 is
        replaced by its two- or three-letter ASCII spelling. All other
        characters, including ordinary ``f``/``i``/``l`` letters, are left
        untouched.
    """
    # Every ligature is a single code point, so one translate() pass handles
    # all five at once and the order of the table entries does not matter.
    return text.translate(_LIGATURE_TABLE)
|
||||
|
||||
|
||||
def merge_para_with_text(para_block):
|
||||
para_text = ''
|
||||
for i, line in enumerate(para_block['lines']):
|
||||
@@ -166,6 +176,8 @@ def merge_para_with_text(para_block):
|
||||
para_text += content
|
||||
else:
|
||||
continue
|
||||
# 连写字符拆分
|
||||
para_text = __replace_ligatures(para_text)
|
||||
|
||||
return para_text
|
||||
|
||||
|
||||
Reference in New Issue
Block a user