mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
@@ -5,7 +5,6 @@ from loguru import logger
|
|||||||
|
|
||||||
from mineru.utils.boxbase import calculate_overlap_area_in_bbox1_area_ratio
|
from mineru.utils.boxbase import calculate_overlap_area_in_bbox1_area_ratio
|
||||||
from mineru.utils.enum_class import ContentType, BlockType
|
from mineru.utils.enum_class import ContentType, BlockType
|
||||||
from mineru.utils.guess_suffix_or_lang import guess_language_by_text
|
|
||||||
from mineru.utils.magic_model_utils import reduct_overlap, tie_up_category_by_index
|
from mineru.utils.magic_model_utils import reduct_overlap, tie_up_category_by_index
|
||||||
|
|
||||||
|
|
||||||
@@ -15,6 +14,10 @@ class MagicModel:
|
|||||||
|
|
||||||
blocks = []
|
blocks = []
|
||||||
self.all_spans = []
|
self.all_spans = []
|
||||||
|
|
||||||
|
# 对caption块进行分类,将其分类为image_caption或table_caption
|
||||||
|
page_blocks = classify_caption_blocks(page_blocks)
|
||||||
|
|
||||||
# 解析每个块
|
# 解析每个块
|
||||||
for index, block_info in enumerate(page_blocks):
|
for index, block_info in enumerate(page_blocks):
|
||||||
|
|
||||||
@@ -24,7 +27,8 @@ class MagicModel:
|
|||||||
if block_type in [
|
if block_type in [
|
||||||
"text",
|
"text",
|
||||||
"title",
|
"title",
|
||||||
"caption",
|
"image_caption",
|
||||||
|
"table_caption",
|
||||||
"header",
|
"header",
|
||||||
"footer",
|
"footer",
|
||||||
]:
|
]:
|
||||||
@@ -44,13 +48,14 @@ class MagicModel:
|
|||||||
"type": span_type,
|
"type": span_type,
|
||||||
}
|
}
|
||||||
if span_type == ContentType.TABLE:
|
if span_type == ContentType.TABLE:
|
||||||
span["html"] = block_info.get("content", "")
|
span["html"] = clean_table_html(block_info.get("content", ""))
|
||||||
elif span_type == ContentType.IMAGE:
|
elif span_type == ContentType.IMAGE:
|
||||||
|
# jpg格式base64
|
||||||
span["image_base64"] = block_info.get("content", "")
|
span["image_base64"] = block_info.get("content", "")
|
||||||
elif span_type in [ContentType.INTERLINE_EQUATION]:
|
elif span_type in [ContentType.INTERLINE_EQUATION]:
|
||||||
span = {
|
span = {
|
||||||
"type": span_type,
|
"type": span_type,
|
||||||
"content": block_content,
|
"content": block_info.get("content", ""),
|
||||||
}
|
}
|
||||||
else:
|
else:
|
||||||
|
|
||||||
@@ -217,6 +222,80 @@ class MagicModel:
|
|||||||
return self.all_spans
|
return self.all_spans
|
||||||
|
|
||||||
|
|
||||||
|
def clean_table_html(html: str) -> str:
|
||||||
|
"""
|
||||||
|
清洗表格HTML,只保留对表格结构表示有用的信息。
|
||||||
|
|
||||||
|
保留的属性:
|
||||||
|
- colspan: 列合并
|
||||||
|
- rowspan: 行合并
|
||||||
|
|
||||||
|
清洗的内容:
|
||||||
|
- 移除所有style属性
|
||||||
|
- 移除所有class属性
|
||||||
|
- 移除border等其他属性
|
||||||
|
- 保持表格结构标签(table, thead, tbody, tr, th, td等)
|
||||||
|
|
||||||
|
Args:
|
||||||
|
html: 原始表格HTML字符串
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
清洗后的HTML字符串
|
||||||
|
"""
|
||||||
|
if not html:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
# 需要保留的属性(对表格结构有用)
|
||||||
|
preserved_attrs = {'colspan', 'rowspan'}
|
||||||
|
|
||||||
|
def clean_tag(match):
|
||||||
|
"""清洗单个标签,只保留结构相关的属性"""
|
||||||
|
full_tag = match.group(0)
|
||||||
|
tag_name = match.group(1).lower()
|
||||||
|
|
||||||
|
# 自闭合标签的处理
|
||||||
|
is_self_closing = full_tag.rstrip().endswith('/>')
|
||||||
|
|
||||||
|
# 提取需要保留的属性
|
||||||
|
kept_attrs = []
|
||||||
|
|
||||||
|
# 匹配所有属性: attr="value" 或 attr='value' 或 attr=value 或单独的attr
|
||||||
|
attr_pattern = r'(\w+)\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|(\S+))|(\w+)(?=\s|>|/>)'
|
||||||
|
for attr_match in re.finditer(attr_pattern, full_tag):
|
||||||
|
if attr_match.group(5):
|
||||||
|
# 单独的属性(如 disabled),跳过
|
||||||
|
continue
|
||||||
|
|
||||||
|
attr_name = attr_match.group(1)
|
||||||
|
if attr_name is None:
|
||||||
|
continue
|
||||||
|
attr_name = attr_name.lower()
|
||||||
|
attr_value = attr_match.group(2) or attr_match.group(3) or attr_match.group(4) or ""
|
||||||
|
|
||||||
|
# 只保留colspan和rowspan
|
||||||
|
if attr_name in preserved_attrs:
|
||||||
|
kept_attrs.append(f'{attr_name}="{attr_value}"')
|
||||||
|
|
||||||
|
# 重建标签
|
||||||
|
if kept_attrs:
|
||||||
|
attrs_str = ' ' + ' '.join(kept_attrs)
|
||||||
|
else:
|
||||||
|
attrs_str = ''
|
||||||
|
|
||||||
|
if is_self_closing:
|
||||||
|
return f'<{tag_name}{attrs_str}/>'
|
||||||
|
else:
|
||||||
|
return f'<{tag_name}{attrs_str}>'
|
||||||
|
|
||||||
|
# 匹配开始标签(包括自闭合标签),捕获标签名
|
||||||
|
# 匹配 <tagname ...> 或 <tagname .../>
|
||||||
|
tag_pattern = r'<(\w+)(?:\s+[^>]*)?\s*/?>'
|
||||||
|
|
||||||
|
result = re.sub(tag_pattern, clean_tag, html)
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
def isolated_formula_clean(txt):
|
def isolated_formula_clean(txt):
|
||||||
latex = txt[:]
|
latex = txt[:]
|
||||||
if latex.startswith("\\["): latex = latex[2:]
|
if latex.startswith("\\["): latex = latex[2:]
|
||||||
@@ -485,4 +564,76 @@ def fix_list_blocks(list_blocks, text_blocks, ref_text_blocks):
|
|||||||
else:
|
else:
|
||||||
list_block["sub_type"] = "unknown"
|
list_block["sub_type"] = "unknown"
|
||||||
|
|
||||||
return list_blocks, text_blocks, ref_text_blocks
|
return list_blocks, text_blocks, ref_text_blocks
|
||||||
|
|
||||||
|
|
||||||
|
def classify_caption_blocks(page_blocks: list) -> list:
|
||||||
|
"""
|
||||||
|
对page_blocks中的caption块进行分类,将其分类为image_caption或table_caption。
|
||||||
|
|
||||||
|
规则:
|
||||||
|
1. 只有与type为table或image相邻的caption可以作为caption
|
||||||
|
2. caption块与table或image中相隔的块全部是caption的情况视为该caption块与table或image相邻
|
||||||
|
3. caption的类型与他前置位相邻的母块type一致(table或image),如果没有前置位母块则检查是否有后置位母块
|
||||||
|
4. 没有相邻母块的caption需要变更type为text
|
||||||
|
"""
|
||||||
|
if not page_blocks:
|
||||||
|
return page_blocks
|
||||||
|
|
||||||
|
available_types = ["table", "image"]
|
||||||
|
|
||||||
|
result_blocks = []
|
||||||
|
n = len(page_blocks)
|
||||||
|
|
||||||
|
for i, block in enumerate(page_blocks):
|
||||||
|
if block.get("type") != "caption":
|
||||||
|
result_blocks.append(block)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# 查找前置位相邻的母块(table或image)
|
||||||
|
# 向前查找,跳过连续的caption块
|
||||||
|
prev_parent_type = None
|
||||||
|
j = i - 1
|
||||||
|
while j >= 0:
|
||||||
|
prev_block_type = page_blocks[j].get("type")
|
||||||
|
if prev_block_type in available_types:
|
||||||
|
prev_parent_type = prev_block_type
|
||||||
|
break
|
||||||
|
elif prev_block_type == "caption":
|
||||||
|
# 继续向前查找
|
||||||
|
j -= 1
|
||||||
|
else:
|
||||||
|
# 遇到非caption且非table/image的块,停止查找
|
||||||
|
break
|
||||||
|
|
||||||
|
# 查找后置位相邻的母块(table或image)
|
||||||
|
# 向后查找,跳过连续的caption块
|
||||||
|
next_parent_type = None
|
||||||
|
k = i + 1
|
||||||
|
while k < n:
|
||||||
|
next_block_type = page_blocks[k].get("type")
|
||||||
|
if next_block_type in available_types:
|
||||||
|
next_parent_type = next_block_type
|
||||||
|
break
|
||||||
|
elif next_block_type == "caption":
|
||||||
|
# 继续向后查找
|
||||||
|
k += 1
|
||||||
|
else:
|
||||||
|
# 遇到非caption且非table/image的块,停止查找
|
||||||
|
break
|
||||||
|
|
||||||
|
# 根据规则确定caption类型
|
||||||
|
new_block = block.copy()
|
||||||
|
if prev_parent_type:
|
||||||
|
# 优先使用前置位母块的类型
|
||||||
|
new_block["type"] = f"{prev_parent_type}_caption"
|
||||||
|
elif next_parent_type:
|
||||||
|
# 没有前置位母块,使用后置位母块的类型
|
||||||
|
new_block["type"] = f"{next_parent_type}_caption"
|
||||||
|
else:
|
||||||
|
# 没有相邻母块,变更为text
|
||||||
|
new_block["type"] = "text"
|
||||||
|
|
||||||
|
result_blocks.append(new_block)
|
||||||
|
|
||||||
|
return result_blocks
|
||||||
|
|||||||
Reference in New Issue
Block a user