Compare commits

..

2 Commits

Author SHA1 Message Date
赵小蒙
f06a32133c make多模态markdown时图片地址更改为fullpath 2024-03-15 14:01:23 +08:00
赵小蒙
59b0b0c3da make markdown时特殊符号转义 2024-03-14 18:49:54 +08:00
2 changed files with 23 additions and 3 deletions

View File

@@ -1,3 +1,4 @@
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
from magic_pdf.libs.ocr_content_type import ContentType
@@ -14,7 +15,7 @@ def ocr_mk_nlp_markdown(pdf_info_dict: dict):
for span in line['spans']:
if not span.get('content'):
continue
content = span['content'].replace('$', '\$') # 转义$
content = ocr_escape_special_markdown_char(span['content']) # 转义特殊符号
if span['type'] == ContentType.InlineEquation:
content = f"${content}$"
elif span['type'] == ContentType.InterlineEquation:
@@ -41,9 +42,9 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
if not span.get('image_path'):
continue
else:
content = f"![]({span['image_path']})"
content = f"![](s3://mllm-raw-media/pdf2md_img/{span['image_path']})"
else:
content = span['content'].replace('$', '\$') # 转义$
content = ocr_escape_special_markdown_char(span['content']) # 转义特殊符号
if span['type'] == ContentType.InlineEquation:
content = f"${content}$"
elif span['type'] == ContentType.InterlineEquation:
@@ -52,3 +53,11 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
# 在行末添加两个空格以强制换行
markdown.append(line_text.strip() + ' ')
return '\n'.join(markdown)
def ocr_mk_mm_standard_format():
    """Placeholder for the multimodal "standard format" export (not implemented).

    Notes on the planned output, carried over from the original stub:

    * the result is a ``content_list``;
    * each entry has a string ``type``: image / text / table / equation;
    * interline (display) equations are emitted as their own entries, while
      inline equations are merged into the surrounding text.

    Currently does nothing and returns ``None``.
    """
    return None

View File

@@ -18,3 +18,14 @@ def escape_special_markdown_char(pymu_blocks):
span['text'] = span['text'].replace(char, "\\" + char)
return pymu_blocks
def ocr_escape_special_markdown_char(content):
    """Escape characters in body text that carry special meaning in Markdown.

    Every asterisk, backtick, tilde and dollar sign found in ``content`` is
    prefixed with a single backslash; all other characters are left as they
    are. Returns the escaped string.
    """
    # One translation table covering all four markers; the inserted backslash
    # is not itself in the table, so a single pass cannot double-escape.
    escape_table = str.maketrans({mark: "\\" + mark for mark in "*`~$"})
    return content.translate(escape_table)