From 6725648080c436487a5c22fbd25cc710e4069ded Mon Sep 17 00:00:00 2001
From: myhloli <moe@myhloli.com>
Date: Mon, 26 Jan 2026 15:21:48 +0800
Subject: [PATCH] feat: enhance DOCX processing to classify caption blocks and
 clean table HTML

---
 mineru/backend/office/office_magic_model.py | 161 +++++++++++++++++++-
 1 file changed, 156 insertions(+), 5 deletions(-)

diff --git a/mineru/backend/office/office_magic_model.py b/mineru/backend/office/office_magic_model.py
index 9f1d72a1..1eb24e7a 100644
--- a/mineru/backend/office/office_magic_model.py
+++ b/mineru/backend/office/office_magic_model.py
@@ -5,7 +5,6 @@ from loguru import logger
 
 from mineru.utils.boxbase import calculate_overlap_area_in_bbox1_area_ratio
 from mineru.utils.enum_class import ContentType, BlockType
-from mineru.utils.guess_suffix_or_lang import guess_language_by_text
 from mineru.utils.magic_model_utils import reduct_overlap, tie_up_category_by_index
 
 
@@ -15,6 +14,10 @@ class MagicModel:
 
         blocks = []
         self.all_spans = []
+
+        # 对caption块进行分类，将其分类为image_caption或table_caption
+        page_blocks = classify_caption_blocks(page_blocks)
+
         # 解析每个块
         for index, block_info in enumerate(page_blocks):
 
@@ -24,7 +27,8 @@ class MagicModel:
             if block_type in [
                 "text",
                 "title",
-                "caption",
+                "image_caption",
+                "table_caption",
                 "header",
                 "footer",
             ]:
@@ -44,13 +48,14 @@ class MagicModel:
                     "type": span_type,
                 }
                 if span_type == ContentType.TABLE:
-                    span["html"] = block_info.get("content", "")
+                    span["html"] = clean_table_html(block_info.get("content", ""))
                 elif span_type == ContentType.IMAGE:
+                    # jpg格式base64
                     span["image_base64"] = block_info.get("content", "")
             elif span_type in [ContentType.INTERLINE_EQUATION]:
                 span = {
                     "type": span_type,
-                    "content": block_content,
+                    "content": block_info.get("content", ""),
                 }
             else:
 
@@ -217,6 +222,80 @@ class MagicModel:
         return self.all_spans
 
 
+def clean_table_html(html: str) -> str:
+    """
+    Strip table HTML down to the attributes that describe table structure.
+
+    Every opening (or self-closing) tag is rebuilt with a lower-cased tag name
+    and only its ``colspan`` / ``rowspan`` attributes. ``style``, ``class``,
+    ``border`` and every other attribute are removed. Closing tags and text
+    content are left untouched.
+
+    Kept values are always re-emitted in double quotes, so an unquoted
+    ``colspan=2`` becomes ``colspan="2"``.
+
+    NOTE: this is regex-based, not a real HTML parser; a ``>`` inside a quoted
+    attribute value still terminates the tag match early.
+
+    Args:
+        html: Raw table HTML string.
+
+    Returns:
+        The cleaned HTML string, or an empty string when ``html`` is empty.
+    """
+    if not html:
+        return ""
+
+    # Attributes that carry table structure and therefore survive cleaning.
+    preserved_attrs = {'colspan', 'rowspan'}
+
+    # Matches name="value", name='value' or name=value.
+    # - The lookbehind and the hyphen-aware name stop ``data-colspan`` from
+    #   being read as ``colspan``.
+    # - An unquoted value ends at whitespace, a quote, ``>`` or the ``/`` of a
+    #   closing ``/>``, so the tag terminator is never captured as part of the
+    #   value (``<td colspan=2>`` previously produced ``colspan="2>"``).
+    # Valueless attributes (e.g. ``nowrap``) never match, so they are dropped.
+    # The tag name is not followed by ``=`` and therefore never matches either.
+    attr_pattern = (
+        r'(?<![\w:-])([\w:-]+)\s*=\s*'
+        r'(?:"([^"]*)"|\'([^\']*)\'|((?:[^\s"\'>/]|/(?!>))+))'
+    )
+
+    def clean_tag(match):
+        """Rebuild one matched tag, keeping only the preserved attributes."""
+        full_tag = match.group(0)
+        tag_name = match.group(1).lower()
+
+        # Remember whether the tag was written as <tag .../>.
+        is_self_closing = full_tag.rstrip().endswith('/>')
+
+        kept_attrs = []
+        for attr_match in re.finditer(attr_pattern, full_tag):
+            attr_name = attr_match.group(1).lower()
+            if attr_name not in preserved_attrs:
+                continue
+            # At most one of the three value groups is non-empty.
+            attr_value = attr_match.group(2) or attr_match.group(3) or attr_match.group(4) or ""
+            kept_attrs.append(f'{attr_name}="{attr_value}"')
+
+        # Reassemble the tag from its name and the surviving attributes.
+        attrs_str = ' ' + ' '.join(kept_attrs) if kept_attrs else ''
+
+        if is_self_closing:
+            return f'<{tag_name}{attrs_str}/>'
+        else:
+            return f'<{tag_name}{attrs_str}>'
+
+    # Opening tags, including self-closing ones: <tagname ...> or <tagname .../>.
+    # Group 1 captures the tag name; closing tags such as </td> do not match.
+    tag_pattern = r'<(\w+)(?:\s+[^>]*)?\s*/?>'
+
+    result = re.sub(tag_pattern, clean_tag, html)
+
+    return result
+
+
 def isolated_formula_clean(txt):
     latex = txt[:]
     if latex.startswith("\\["): latex = latex[2:]
@@ -485,4 +564,76 @@ def fix_list_blocks(list_blocks, text_blocks, ref_text_blocks):
         else:
             list_block["sub_type"] = "unknown"
 
-    return list_blocks, text_blocks, ref_text_blocks
\ No newline at end of file
+    return list_blocks, text_blocks, ref_text_blocks
+
+
+def classify_caption_blocks(page_blocks: list) -> list:
+    """
+    Resolve each generic ``caption`` block in ``page_blocks`` into
+    ``image_caption``, ``table_caption`` or plain ``text``.
+
+    Rules:
+    1. Only a caption adjacent to a ``table`` or ``image`` block stays a caption.
+    2. A caption still counts as adjacent when every block between it and the
+       ``table``/``image`` block is itself a caption.
+    3. The caption takes the type of the adjacent parent block that precedes
+       it; the parent block that follows it is used only when none precedes it.
+    4. A caption without any adjacent parent block is retyped as ``text``.
+
+    Args:
+        page_blocks: Blocks of one page, in order. Each block is accessed
+            through ``.get("type")``; captions are also shallow-copied.
+
+    Returns:
+        A new list in the same order. Caption blocks are replaced by shallow
+        copies carrying the resolved type; all other blocks are the original
+        objects. An empty ``page_blocks`` is returned as-is.
+    """
+    if not page_blocks:
+        return page_blocks
+
+    # Block types a caption can be attached to.
+    parent_types = ["table", "image"]
+    total = len(page_blocks)
+
+    def find_parent(start: int, step: int):
+        """
+        Walk from index ``start`` in direction ``step`` (-1 or +1).
+
+        Caption blocks are stepped over. Returns the type of the first
+        ``table``/``image`` block reached, or None when some other block
+        type or the end of the list comes first.
+        """
+        pos = start
+        while 0 <= pos < total:
+            kind = page_blocks[pos].get("type")
+            if kind in parent_types:
+                return kind
+            if kind != "caption":
+                # Any other block type breaks the adjacency chain.
+                return None
+            pos += step
+        return None
+
+    # Build the result in page order.
+    classified = []
+    for idx, blk in enumerate(page_blocks):
+        if blk.get("type") != "caption":
+            # Not a caption: pass the original object through untouched.
+            classified.append(blk)
+            continue
+
+        # Look on both sides of the caption for a parent block.
+        before = find_parent(idx - 1, -1)
+        after = find_parent(idx + 1, 1)
+
+        # The preceding parent wins; the following one is only a fallback.
+        owner = before or after
+
+        # Copy so the caller's block dict keeps its original "caption" type.
+        retyped = blk.copy()
+        # With no parent on either side the caption is demoted to plain text.
+        retyped["type"] = f"{owner}_caption" if owner else "text"
+        classified.append(retyped)
+
+    return classified