make多模态markdown时图片地址更改为fullpath

make markdown时特殊符号转义
2026-04-12 07:06:44 +07:00 · 2024-03-15 14:01:23 +08:00 · 2024-03-14 18:49:54 +08:00
2 changed files with 23 additions and 3 deletions
--- a/magic_pdf/dict2md/ocr_mkcontent.py
+++ b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -1,3 +1,4 @@
+from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
 from magic_pdf.libs.ocr_content_type import ContentType


@@ -14,7 +15,7 @@ def ocr_mk_nlp_markdown(pdf_info_dict: dict):
                for span in line['spans']:
                    if not span.get('content'):
                        continue
-                    content = span['content'].replace('$', '\$')  # 转义$
+                    content = ocr_escape_special_markdown_char(span['content'])  # 转义特殊符号
                    if span['type'] == ContentType.InlineEquation:
                        content = f"${content}$"
                    elif span['type'] == ContentType.InterlineEquation:
@@ -41,9 +42,9 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
                        if not span.get('image_path'):
                            continue
                        else:
-                            content = f"![]({span['image_path']})"
+                            content = f"![](s3://mllm-raw-media/pdf2md_img/{span['image_path']})"
                    else:
-                        content = span['content'].replace('$', '\$')  # 转义$
+                        content = ocr_escape_special_markdown_char(span['content'])  # 转义特殊符号
                        if span['type'] == ContentType.InlineEquation:
                            content = f"${content}$"
                        elif span['type'] == ContentType.InterlineEquation:
@@ -52,3 +53,11 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
                # 在行末添加两个空格以强制换行
                markdown.append(line_text.strip() + '  ')
    return '\n'.join(markdown)
+
+def ocr_mk_mm_standard_format():
+    '''
+    content_list
+    type string image/text/table/equation (interline equations are extracted on their own; inline equations are merged with text)
+    Stub: not implemented yet; the notes above are design notes only.
+    '''
+    pass
--- a/magic_pdf/libs/markdown_utils.py
+++ b/magic_pdf/libs/markdown_utils.py
@@ -18,3 +18,14 @@ def escape_special_markdown_char(pymu_blocks):
                        span['text'] = span['text'].replace(char, "\\" + char)

    return pymu_blocks
+
+
+def ocr_escape_special_markdown_char(content):
+    """
+    Escape characters in body text that have special meaning in Markdown syntax (*, `, ~, $) by prefixing each with a backslash.
+    """
+    special_chars = ["*", "`", "~", "$"]
+    for char in special_chars:
+        content = content.replace(char, "\\" + char)
+
+    return content
Author	SHA1	Message	Date
赵小蒙	f06a32133c	make多模态markdown时图片地址更改为fullpath	2024-03-15 14:01:23 +08:00
赵小蒙	59b0b0c3da	make markdown时特殊符号转义	2024-03-14 18:49:54 +08:00