feat: implement content list v2 generation and update content types

2026-03-27 11:08:32 +07:00 · 2025-12-09 17:38:52 +08:00
parent abcd27ad88
commit ff2d81ad5f
3 changed files with 316 additions and 2 deletions
--- a/mineru/backend/vlm/vlm_middle_json_mkcontent.py
+++ b/mineru/backend/vlm/vlm_middle_json_mkcontent.py
@@ -1,7 +1,8 @@
 import os

+from loguru import logger
 from mineru.utils.config_reader import get_latex_delimiter_config, get_formula_enable, get_table_enable
-from mineru.utils.enum_class import MakeMode, BlockType, ContentType
+from mineru.utils.enum_class import MakeMode, BlockType, ContentType, ContentTypeV2

 latex_delimiters_config = get_latex_delimiter_config()

@@ -234,6 +235,277 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)

    return para_content

+
+# Page-level (non-body) block types and the v2 content type each is emitted as.
+_PAGE_BLOCK_TYPES_V2 = {
+    BlockType.HEADER: ContentTypeV2.PAGE_HEADER,
+    BlockType.FOOTER: ContentTypeV2.PAGE_FOOTER,
+    BlockType.ASIDE_TEXT: ContentTypeV2.PAGE_ASIDE_TEXT,
+    BlockType.PAGE_NUMBER: ContentTypeV2.PAGE_NUMBER,
+    BlockType.PAGE_FOOTNOTE: ContentTypeV2.PAGE_FOOTNOTE,
+}
+
+
+def make_blocks_to_content_list_v2(para_block, img_buket_path, page_size):
+    """Convert one middle-json block into a content-list-v2 entry.
+
+    Page-level blocks (header, footer, aside text, page number, page footnote)
+    become ``page_*`` entries; titles, paragraphs, equations, images, tables,
+    code/algorithms and lists each get their own ``content`` layout.
+
+    Args:
+        para_block: block dict; its ``type`` selects the output layout.
+        img_buket_path: prefix joined with ``/`` to every image path.
+        page_size: ``(width, height)`` used to rescale ``bbox``.
+
+    Returns:
+        Dict with ``type`` and ``content``, plus ``bbox`` when the block has
+        one. Unhandled block types are logged and get no ``type``/``content``.
+
+    Raises:
+        ValueError: on an unknown code or list ``sub_type``.
+    """
+    para_type = para_block['type']
+    para_content = {}
+    if para_type in _PAGE_BLOCK_TYPES_V2:
+        content_type = _PAGE_BLOCK_TYPES_V2[para_type]
+        para_content = {
+            'type': content_type,
+            'content': {
+                f"{content_type}_content": merge_para_with_text_v2(para_block),
+            }
+        }
+    elif para_type == BlockType.TITLE:
+        title_level = get_title_level(para_block)
+        if title_level != 0:
+            para_content = {
+                'type': ContentTypeV2.TITLE,
+                'content': {
+                    "title_content": merge_para_with_text_v2(para_block),
+                    "level": title_level
+                }
+            }
+        else:
+            # A title whose level is 0 is emitted as a plain paragraph.
+            para_content = {
+                'type': ContentTypeV2.PARAGRAPH,
+                'content': {
+                    "paragraph_content": merge_para_with_text_v2(para_block),
+                }
+            }
+    elif para_type in [BlockType.TEXT, BlockType.PHONETIC]:
+        para_content = {
+            'type': ContentTypeV2.PARAGRAPH,
+            'content': {
+                'paragraph_content': merge_para_with_text_v2(para_block),
+            }
+        }
+    elif para_type == BlockType.INTERLINE_EQUATION:
+        # For an equation span get_body_data yields (image_path, span content).
+        image_path, math_content = get_body_data(para_block)
+        para_content = {
+            'type': ContentTypeV2.EQUATION_INTERLINE,
+            'content': {
+                'math_content': math_content,
+                'math_type': 'latex',
+                'image_source': {'path': f"{img_buket_path}/{image_path}"},
+            }
+        }
+    elif para_type == BlockType.IMAGE:
+        image_caption = []
+        image_footnote = []
+        image_path, _ = get_body_data(para_block)
+        for block in para_block['blocks']:
+            if block['type'] == BlockType.IMAGE_CAPTION:
+                image_caption.extend(merge_para_with_text_v2(block))
+            elif block['type'] == BlockType.IMAGE_FOOTNOTE:
+                image_footnote.extend(merge_para_with_text_v2(block))
+        para_content = {
+            'type': ContentTypeV2.IMAGE,
+            'content': {
+                'image_source': {'path': f"{img_buket_path}/{image_path}"},
+                'image_caption': image_caption,
+                'image_footnote': image_footnote,
+            }
+        }
+    elif para_type == BlockType.TABLE:
+        table_caption = []
+        table_footnote = []
+        image_path, html = get_body_data(para_block)
+        # More than one "<table" occurrence is reported as nest level 2.
+        table_nest_level = 2 if html.count("<table") > 1 else 1
+        # Nesting or merged cells (colspan/rowspan) make the table "complex".
+        is_complex = table_nest_level > 1 or "colspan" in html or "rowspan" in html
+        table_type = ContentTypeV2.TABLE_COMPLEX if is_complex else ContentTypeV2.TABLE_SIMPLE
+        for block in para_block['blocks']:
+            if block['type'] == BlockType.TABLE_CAPTION:
+                table_caption.extend(merge_para_with_text_v2(block))
+            elif block['type'] == BlockType.TABLE_FOOTNOTE:
+                table_footnote.extend(merge_para_with_text_v2(block))
+        para_content = {
+            'type': ContentTypeV2.TABLE,
+            'content': {
+                'image_source': {'path': f"{img_buket_path}/{image_path}"},
+                'table_caption': table_caption,
+                'table_footnote': table_footnote,
+                'html': html,
+                'table_type': table_type,
+                'table_nest_level': table_nest_level,
+            }
+        }
+    elif para_type == BlockType.CODE:
+        code_caption = []
+        code_content = []
+        # Captions accumulate; the last CODE_BODY block supplies the content.
+        for block in para_block['blocks']:
+            if block['type'] == BlockType.CODE_CAPTION:
+                code_caption.extend(merge_para_with_text_v2(block))
+            elif block['type'] == BlockType.CODE_BODY:
+                code_content = merge_para_with_text_v2(block)
+        sub_type = para_block["sub_type"]
+        if sub_type == BlockType.CODE:
+            para_content = {
+                'type': ContentTypeV2.CODE,
+                'content': {
+                    'code_caption': code_caption,
+                    'code_content': code_content,
+                    'code_language': para_block.get('guess_lang', 'txt'),
+                }
+            }
+        elif sub_type == BlockType.ALGORITHM:
+            para_content = {
+                'type': ContentTypeV2.ALGORITHM,
+                'content': {
+                    'algorithm_caption': code_caption,
+                    'algorithm_content': code_content,
+                }
+            }
+        else:
+            raise ValueError(f"Unknown code sub_type: {sub_type}")
+    elif para_type == BlockType.REF_TEXT:
+        # A REF_TEXT block is emitted as a one-item reference list.
+        para_content = {
+            'type': ContentTypeV2.LIST,
+            'content': {
+                'list_type': ContentTypeV2.LIST_REF,
+                'list_items': [
+                    {
+                        'item_type': 'text',
+                        'item_content': merge_para_with_text_v2(para_block),
+                    }
+                ],
+            }
+        }
+    elif para_type == BlockType.LIST:
+        # A list without a sub_type is treated as a plain text list.
+        sub_type = para_block.get('sub_type', BlockType.TEXT)
+        if sub_type == BlockType.REF_TEXT:
+            list_type = ContentTypeV2.LIST_REF
+        elif sub_type == BlockType.TEXT:
+            list_type = ContentTypeV2.LIST_TEXT
+        else:
+            raise ValueError(f"Unknown list sub_type: {sub_type}")
+        list_items = []
+        for block in para_block['blocks']:
+            item_content = merge_para_with_text_v2(block)
+            if item_content:
+                list_items.append({
+                    'item_type': 'text',
+                    'item_content': item_content,
+                })
+        para_content = {
+            'type': ContentTypeV2.LIST,
+            'content': {
+                'list_type': list_type,
+                'list_items': list_items,
+            }
+        }
+    else:
+        logger.warning(f"Unhandled block type in make_blocks_to_content_list_v2: {para_type}")
+
+    # Express the bbox in integer thousandths of the page width and height.
+    page_width, page_height = page_size
+    para_bbox = para_block.get('bbox')
+    if para_bbox:
+        x0, y0, x1, y1 = para_bbox
+        para_content['bbox'] = [
+            int(x0 * 1000 / page_width),
+            int(y0 * 1000 / page_height),
+            int(x1 * 1000 / page_width),
+            int(y1 * 1000 / page_height),
+        ]
+
+    return para_content
+
+
+
+
+
+def get_body_data(para_block):
+    """Return ``(image_path, payload)`` from the first recognised span of a block.
+
+    The payload depends on the span type: a table span gives its ``html``, an
+    interline-equation or text span gives its ``content`` (text spans report an
+    empty image path), and an image span gives ``''``. No match: ``('', '')``.
+    """
+    empty = ('', '')
+
+    def scan(lines):
+        # The first span of a known type decides; other span types are skipped.
+        for line in lines:
+            for span in line.get('spans', []):
+                kind = span.get('type')
+                if kind == ContentType.TABLE:
+                    return span.get('image_path', ''), span.get('html', '')
+                if kind == ContentType.IMAGE:
+                    return span.get('image_path', ''), ''
+                if kind == ContentType.INTERLINE_EQUATION:
+                    return span.get('image_path', ''), span.get('content', '')
+                if kind == ContentType.TEXT:
+                    return '', span.get('content', '')
+        return empty
+
+    # Flat block: the lines hang directly off the block.
+    if 'blocks' not in para_block:
+        return scan(para_block.get('lines', []))
+
+    # Nested block: the first image/table/code body that yields data wins.
+    for sub_block in para_block['blocks']:
+        if sub_block.get('type') in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY, BlockType.CODE_BODY]:
+            found = scan(sub_block.get('lines', []))
+            if found != empty:
+                return found
+    return empty
+
+
+def merge_para_with_text_v2(para_block):
+    """Collect the non-empty spans of ``para_block`` as v2 ``{'type', 'content'}`` dicts.
+
+    Missing ``lines``/``spans``/``type``/``content`` keys are tolerated (treated as
+    empty); spans of an unsupported type are skipped with a warning.
+    """
+    para_content = []
+    para_type = para_block['type']
+    for line in para_block.get('lines', []):
+        for span in line.get('spans', []):
+            content = span.get('content')
+            if not content:
+                continue
+            span_type = span.get('type')
+            if para_type == BlockType.PHONETIC and span_type == ContentTypeV2.SPAN_TEXT:
+                span_type = ContentTypeV2.SPAN_PHONETIC
+            if span_type == ContentType.INLINE_EQUATION:
+                span_type = ContentTypeV2.SPAN_EQUATION_INLINE
+            if span_type in (
+                ContentTypeV2.SPAN_TEXT, ContentTypeV2.SPAN_PHONETIC, ContentTypeV2.SPAN_MD,
+                ContentTypeV2.SPAN_EQUATION_INLINE, ContentTypeV2.SPAN_CODE_INLINE,
+            ):
+                para_content.append({'type': span_type, 'content': content})
+            else:
+                logger.warning(f"Unknown span type in merge_para_with_text_v2: {span_type}")
+    return para_content
+
+
 def union_make(pdf_info_dict: list,
               make_mode: str,
               img_buket_path: str = '',
@@ -260,10 +532,20 @@ def union_make(pdf_info_dict: list,
            for para_block in para_blocks:
                para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
                output_content.append(para_content)
+        elif make_mode == MakeMode.CONTENT_LIST_V2:
+            # https://github.com/drunkpig/llm-webkit-mirror/blob/dev6/docs/specification/output_format/content_list_spec.md
+            page_contents = []
+            para_blocks = (paras_of_layout or []) + (paras_of_discarded or [])
+            if not para_blocks:
+                continue
+            for para_block in para_blocks:
+                para_content = make_blocks_to_content_list_v2(para_block, img_buket_path, page_size)
+                page_contents.append(para_content)
+            output_content.append(page_contents)

    if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
        return '\n\n'.join(output_content)
-    elif make_mode == MakeMode.CONTENT_LIST:
+    elif make_mode in [MakeMode.CONTENT_LIST, MakeMode.CONTENT_LIST_V2]:
        return output_content
    return None

--- a/mineru/cli/common.py
+++ b/mineru/cli/common.py
@@ -144,6 +144,12 @@ def _process_output(
            f"{pdf_file_name}_content_list.json",
            json.dumps(content_list, ensure_ascii=False, indent=4),
        )
+        content_list_v2 = make_func(pdf_info, MakeMode.CONTENT_LIST_V2, image_dir)
+        md_writer.write_string(
+            f"{pdf_file_name}_content_list_v2.json",
+            json.dumps(content_list_v2, ensure_ascii=False, indent=4),
+        )
+

    if f_dump_middle_json:
        md_writer.write_string(
--- a/mineru/utils/enum_class.py
+++ b/mineru/utils/enum_class.py
@@ -38,6 +38,31 @@ class ContentType:
    CODE = 'code'


+class ContentTypeV2:  # type names written into the content list v2 output
+    CODE = 'code'  # block-level entry types
+    ALGORITHM = 'algorithm'
+    EQUATION_INTERLINE = 'equation_interline'
+    IMAGE = 'image'
+    TABLE = 'table'
+    TABLE_SIMPLE = 'simple_table'  # values of a table entry's 'table_type'
+    TABLE_COMPLEX = 'complex_table'
+    LIST = 'list'
+    LIST_TEXT = 'text_list'  # values of a list entry's 'list_type'
+    LIST_REF = 'reference_list'
+    TITLE = 'title'
+    PARAGRAPH = 'paragraph'
+    SPAN_TEXT = 'text'  # span types inside merged text content
+    SPAN_EQUATION_INLINE = 'equation_inline'
+    SPAN_PHONETIC = 'phonetic'
+    SPAN_MD = 'md'
+    SPAN_CODE_INLINE = 'code_inline'
+    PAGE_HEADER = 'page_header'  # page-level entry types
+    PAGE_FOOTER = 'page_footer'
+    PAGE_NUMBER = 'page_number'
+    PAGE_ASIDE_TEXT = 'page_aside_text'
+    PAGE_FOOTNOTE = 'page_footnote'
+
+
 class CategoryId:
    Title = 0
    Text = 1
@@ -60,6 +85,7 @@ class MakeMode:
    MM_MD = 'mm_markdown'
    NLP_MD = 'nlp_markdown'
    CONTENT_LIST = 'content_list'
+    CONTENT_LIST_V2 = 'content_list_v2'


 class ModelPath: