修复spans为空list导致的IndexError: list index out of range

s3_image_save_path统一配置
2026-03-27 11:08:32 +07:00 · 2024-03-15 16:53:29 +08:00 · 2024-03-15 16:45:43 +08:00
5 changed files with 95 additions and 86 deletions
--- a/magic_pdf/dict2md/ocr_mkcontent.py
+++ b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -1,3 +1,4 @@
+from magic_pdf.libs.commons import s3_image_save_path, join_path
 from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
 from magic_pdf.libs.ocr_content_type import ContentType

@@ -42,7 +43,7 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
                        if not span.get('image_path'):
                            continue
                        else:
-                            content = f"![](s3://mllm-raw-media/pdf2md_img/{span['image_path']})"
+                            content = f"![]({join_path(s3_image_save_path, span['image_path'])})"
                    else:
                        content = ocr_escape_special_markdown_char(span['content'])  # 转义特殊符号
                        if span['type'] == ContentType.InlineEquation:
@@ -73,7 +74,7 @@ def mk_mm_markdown2(pdf_info_dict:dict):
                    elif span_type == ContentType.InterlineEquation:
                        para_text += f"$$\n{span['content']}\n$$ "
                    elif span_type == ContentType.Image:
-                        para_text += f"![](s3://mllm-raw-media/pdf2md_img/{span['image_path']}) "
+                        para_text += f"![]({join_path(s3_image_save_path, span['image_path'])}) "  # keep the trailing space (as the removed line and the $$ branch do) so the image is not glued to the next span
            markdown.append(para_text)

    return '\n\n'.join(markdown)
--- a/magic_pdf/libs/commons.py
+++ b/magic_pdf/libs/commons.py
@@ -24,6 +24,8 @@ error_log_path = "s3://llm-pdf-text/err_logs/"
 # json_dump_path = "s3://pdf_books_temp/json_dump/" # 这条路径仅用于临时本地测试,不能提交到main
 json_dump_path = "s3://llm-pdf-text/json_dump/"

+s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/"
+

 def get_top_percent_list(num_list, percent):
    """
--- a/magic_pdf/pipeline.py
+++ b/magic_pdf/pipeline.py
@@ -4,7 +4,7 @@ import time
 from urllib.parse import quote

 from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_nlp_markdown, ocr_mk_mm_markdown
-from magic_pdf.libs.commons import read_file, join_path, parse_bucket_key, formatted_time
+from magic_pdf.libs.commons import read_file, join_path, parse_bucket_key, formatted_time, s3_image_save_path
 from magic_pdf.libs.drop_reason import DropReason
 from magic_pdf.libs.json_compressor import JsonCompressor
 from magic_pdf.dict2md.mkcontent import mk_nlp_markdown
@@ -287,7 +287,7 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
    #     jso['drop_reason'] = DropReason.HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES
    else:
        try:
-            save_path = "s3://mllm-raw-media/pdf2md_img/"
+            save_path = s3_image_save_path
            image_s3_config = get_s3_config(save_path)
            start_time = time.time()  # 记录开始时间
            # 先打印一下book_name和解析开始的时间
@@ -328,7 +328,7 @@ def ocr_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
    file_id = jso.get('file_id')
    book_name = f"{data_source}/{file_id}"
    try:
-        save_path = "s3://mllm-raw-media/pdf2md_img/"
+        save_path = s3_image_save_path
        image_s3_config = get_s3_config(save_path)
        start_time = time.time()  # 记录开始时间
        # 先打印一下book_name和解析开始的时间
--- a/magic_pdf/pre_proc/ocr_dict_merge.py
+++ b/magic_pdf/pre_proc/ocr_dict_merge.py
@@ -24,34 +24,37 @@ def line_sort_spans_by_left_to_right(lines):
    return line_objects

 def merge_spans_to_line(spans):
-    # 按照y0坐标排序
-    spans.sort(key=lambda span: span['bbox'][1])
+    if len(spans) == 0:
+        return []
+    else:
+        # Sort the spans by their y0 coordinate (bbox[1])
+        spans.sort(key=lambda span: span['bbox'][1])

-    lines = []
-    current_line = [spans[0]]
-    for span in spans[1:]:
-        # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
-        # image和table类型，同上
-        if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
-                s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line):
-            # 则开始新行
+        lines = []
+        current_line = [spans[0]]
+        for span in spans[1:]:
+            # If the current span is an "interline_equation", or the current line already contains one
+            # (the same applies to the image and table types),
+            if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
+                    s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line):
+                # then start a new line
+                lines.append(current_line)
+                current_line = [span]
+                continue
+
+            # If the current span overlaps the last span of the current line on the y axis, add it to the current line
+            if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
+                current_line.append(span)
+            else:
+                # Otherwise, start a new line
+                lines.append(current_line)
+                current_line = [span]
+
+        # Append the last line
+        if current_line:
             lines.append(current_line)
-            current_line = [span]
-            continue

-        # 如果当前的span与当前行的最后一个span在y轴上重叠，则添加到当前行
-        if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
-            current_line.append(span)
-        else:
-            # 否则，开始新行
-            lines.append(current_line)
-            current_line = [span]
-
-    # 添加最后一行
-    if current_line:
-        lines.append(current_line)
-
-    return lines
+        return lines

 def merge_spans_to_line_by_layout(spans, layout_bboxes):
    lines = []
--- a/magic_pdf/pre_proc/ocr_span_list_modify.py
+++ b/magic_pdf/pre_proc/ocr_span_list_modify.py
@@ -77,70 +77,73 @@ def adjust_bbox_for_standalone_block(spans):

 def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
     # displayed_list = []
+    # If spans is empty, do nothing
+    if len(spans) == 0:
+        pass
+    else:
+        spans.sort(key=lambda span: span['bbox'][1])

-    spans.sort(key=lambda span: span['bbox'][1])
+        lines = []
+        current_line = [spans[0]]
+        if spans[0]["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
+            displayed_list.append(spans[0])

-    lines = []
-    current_line = [spans[0]]
-    if spans[0]["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
-        displayed_list.append(spans[0])
+        line_first_y0 = spans[0]["bbox"][1]
+        line_first_y = spans[0]["bbox"][3]
+        # Used for the interline-equation search
+        # text_inline_lines = []
+        for span in spans[1:]:
+            # if span.get("content","") == "78.":
+            #     print("debug")
+            # If the current span is an "interline_equation", or the current line already contains one
+            # (the same applies to the image and table types),
+            if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
+                    s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line):
+                # Record the span in displayed_list
+                if span["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
+                    displayed_list.append(span)
+                # then start a new line
+                lines.append(current_line)
+                if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
+                    text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
+                current_line = [span]
+                line_first_y0 = span["bbox"][1]
+                line_first_y = span["bbox"][3]
+                continue

-    line_first_y0 = spans[0]["bbox"][1]
-    line_first_y = spans[0]["bbox"][3]
-    # 用于给行间公式搜索
-    # text_inline_lines = []
-    for span in spans[1:]:
-        # if span.get("content","") == "78.":
-        #     print("debug")
-        # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
-        # image和table类型，同上
-        if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
-                s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line):
-            # 传入
-            if span["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
-                displayed_list.append(span)
-            # 则开始新行
+            # If the current span overlaps the last span of the current line on the y axis, add it to the current line
+            if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
+                if span["type"] == "text":
+                    line_first_y0 = span["bbox"][1]
+                    line_first_y = span["bbox"][3]
+                current_line.append(span)
+
+            else:
+                # Otherwise, start a new line
+                lines.append(current_line)
+                text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
+                current_line = [span]
+                line_first_y0 = span["bbox"][1]
+                line_first_y = span["bbox"][3]
+
+        # Append the last line
+        if current_line:
             lines.append(current_line)
             if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
                 text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
-            current_line = [span]
-            line_first_y0 = span["bbox"][1]
-            line_first_y = span["bbox"][3]
-            continue
+        for line in text_inline_lines:
+            # Sort the spans of the line by their x0 coordinate (bbox[0])
+            current_line = line[0]
+            current_line.sort(key=lambda span: span['bbox'][0])

-        # 如果当前的span与当前行的最后一个span在y轴上重叠，则添加到当前行
-        if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
-            if span["type"] == "text":
-                line_first_y0 = span["bbox"][1]
-                line_first_y = span["bbox"][3]
-            current_line.append(span)
+        # Give every span within a text line the same bbox y-range (bbox[1] and bbox[3])
+        for line in text_inline_lines:
+            current_line, (line_first_y0, line_first_y) = line
+            for span in current_line:
+                span["bbox"][1] = line_first_y0
+                span["bbox"][3] = line_first_y

-        else:
-            # 否则，开始新行
-            lines.append(current_line)
-            text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
-            current_line = [span]
-            line_first_y0 = span["bbox"][1]
-            line_first_y = span["bbox"][3]
-
-        # 添加最后一行
-    if current_line:
-        lines.append(current_line)
-        if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
-            text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
-    for line in text_inline_lines:
-        # 按照x0坐标排序
-        current_line = line[0]
-        current_line.sort(key=lambda span: span['bbox'][0])
-
-    # 调整每一个文字行内bbox统一
-    for line in text_inline_lines:
-        current_line, (line_first_y0, line_first_y) = line
-        for span in current_line:
-            span["bbox"][1] = line_first_y0
-            span["bbox"][3] = line_first_y
-
-    # return spans, displayed_list, text_inline_lines
+        # return spans, displayed_list, text_inline_lines


 def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list):
Author	SHA1	Message	Date
赵小蒙	a01356400e	修复spans为空list导致的IndexError: list index out of range	2024-03-15 16:53:29 +08:00
赵小蒙	f10b4a501f	s3_image_save_path统一配置	2024-03-15 16:45:43 +08:00