mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-04-12 07:06:44 +07:00
Compare commits
2 Commits
magic_pdf-
...
magic_pdf-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f06a32133c | ||
|
|
59b0b0c3da |
@@ -1,3 +1,4 @@
|
||||
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
|
||||
from magic_pdf.libs.ocr_content_type import ContentType
|
||||
|
||||
|
||||
@@ -14,7 +15,7 @@ def ocr_mk_nlp_markdown(pdf_info_dict: dict):
|
||||
for span in line['spans']:
|
||||
if not span.get('content'):
|
||||
continue
|
||||
content = span['content'].replace('$', '\$') # 转义$
|
||||
content = ocr_escape_special_markdown_char(span['content']) # 转义特殊符号
|
||||
if span['type'] == ContentType.InlineEquation:
|
||||
content = f"${content}$"
|
||||
elif span['type'] == ContentType.InterlineEquation:
|
||||
@@ -41,9 +42,9 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
|
||||
if not span.get('image_path'):
|
||||
continue
|
||||
else:
|
||||
content = f""
|
||||
content = f""
|
||||
else:
|
||||
content = span['content'].replace('$', '\$') # 转义$
|
||||
content = ocr_escape_special_markdown_char(span['content']) # 转义特殊符号
|
||||
if span['type'] == ContentType.InlineEquation:
|
||||
content = f"${content}$"
|
||||
elif span['type'] == ContentType.InterlineEquation:
|
||||
@@ -52,3 +53,11 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
|
||||
# 在行末添加两个空格以强制换行
|
||||
markdown.append(line_text.strip() + ' ')
|
||||
return '\n'.join(markdown)
|
||||
|
||||
def ocr_mk_mm_standard_format():
    """Placeholder for the standard-format ``content_list`` output.

    Not implemented yet: the function takes no arguments and returns None.

    Design note carried over from the original docstring:

    content_list
        type (string): image / text / table / equation
        (interline equations are pulled out on their own; inline
        equations are merged with text)
    """
    return None
|
||||
@@ -18,3 +18,14 @@ def escape_special_markdown_char(pymu_blocks):
|
||||
span['text'] = span['text'].replace(char, "\\" + char)
|
||||
|
||||
return pymu_blocks
|
||||
|
||||
|
||||
def ocr_escape_special_markdown_char(content):
    """Escape characters in body text that carry special meaning in markdown.

    Each occurrence of ``*``, a backtick, ``~`` or ``$`` in *content* is
    prefixed with a single backslash. All other characters, including
    existing backslashes, are left untouched.

    Args:
        content: The text to escape.

    Returns:
        A new string with the special characters backslash-escaped.
    """
    # One translation pass is equivalent to replacing each character in turn,
    # because no replacement introduces another character from the set.
    escape_table = str.maketrans({symbol: "\\" + symbol for symbol in "*`~$"})
    return content.translate(escape_table)
|
||||
|
||||
Reference in New Issue
Block a user