mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
Compare commits
16 Commits
release-2.
...
copilot/an
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
fbea43f0e0 | ||
|
|
9ac7d8cbf7 | ||
|
|
dc42d7d654 | ||
|
|
1c57a7bd9b | ||
|
|
e2282af933 | ||
|
|
5d7a4b8854 | ||
|
|
263eb3ff4c | ||
|
|
fb2d6ade0b | ||
|
|
ee45dd6555 | ||
|
|
29c6cd8afc | ||
|
|
c732563b6b | ||
|
|
660de69b00 | ||
|
|
076611bdf1 | ||
|
|
26b3f774e0 | ||
|
|
d832388cff | ||
|
|
db40932e6d |
6
.gitignore
vendored
6
.gitignore
vendored
@@ -16,6 +16,12 @@ debug/
|
||||
*.ipynb
|
||||
.idea
|
||||
|
||||
# Python build artifacts
|
||||
*.egg-info/
|
||||
build/
|
||||
dist/
|
||||
*.egg
|
||||
|
||||
# vscode history
|
||||
.history
|
||||
|
||||
|
||||
@@ -50,9 +50,9 @@
|
||||
- [昇腾 Ascend](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Ascend)
|
||||
- [平头哥 T-Head](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/THead)
|
||||
- [沐曦 METAX](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/METAX)
|
||||
- [海光 DCU](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Hygon/)
|
||||
- [燧原 GCU](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Enflame/)
|
||||
- [摩尔线程 MUSA](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/MooreThreads/)
|
||||
- [海光 Hygon](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Hygon/)
|
||||
- [燧原 Enflame](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Enflame/)
|
||||
- [摩尔线程 MooreThreads](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/MooreThreads/)
|
||||
- MinerU 持续兼容国产硬件平台,支持主流芯片架构。以安全可靠的技术,助力科研、政企用户迈向文档数字化新高度!
|
||||
- 跨页表合并优化,提升合并成功率与合并效果
|
||||
|
||||
|
||||
@@ -424,7 +424,8 @@ def __tie_up_category_by_index(blocks, subject_block_type, object_block_type):
|
||||
# 调用通用方法
|
||||
return tie_up_category_by_index(
|
||||
get_subjects,
|
||||
get_objects
|
||||
get_objects,
|
||||
object_block_type=object_block_type
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -349,7 +349,8 @@ def __tie_up_category_by_index(blocks, subject_block_type, object_block_type):
|
||||
# 调用通用方法
|
||||
return tie_up_category_by_index(
|
||||
get_subjects,
|
||||
get_objects
|
||||
get_objects,
|
||||
object_block_type=object_block_type
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
包含两个MagicModel类中重复使用的方法和逻辑
|
||||
"""
|
||||
from typing import List, Dict, Any, Callable
|
||||
|
||||
from loguru import logger
|
||||
from mineru.utils.boxbase import bbox_distance, bbox_center_distance, is_in
|
||||
|
||||
|
||||
@@ -172,11 +174,15 @@ def tie_up_category_by_index(
|
||||
get_subjects_func: Callable,
|
||||
get_objects_func: Callable,
|
||||
extract_subject_func: Callable = None,
|
||||
extract_object_func: Callable = None
|
||||
extract_object_func: Callable = None,
|
||||
object_block_type: str = "object",
|
||||
):
|
||||
"""
|
||||
基于index的类别关联方法,用于将主体对象与客体对象进行关联
|
||||
客体优先匹配给index最接近的主体,index差值相同时使用bbox中心点距离作为tiebreaker
|
||||
客体优先匹配给index最接近的主体,匹配优先级为:
|
||||
1. index差值(最高优先级)
|
||||
2. bbox边缘距离(相邻边距离)
|
||||
3. bbox中心点距离(最低优先级,作为最终tiebreaker)
|
||||
|
||||
参数:
|
||||
get_subjects_func: 函数,提取主体对象
|
||||
@@ -207,6 +213,29 @@ def tie_up_category_by_index(
|
||||
"sub_idx": i,
|
||||
}
|
||||
|
||||
# 提取所有客体的index集合,用于计算有效index差值
|
||||
object_indices = set(obj["index"] for obj in objects)
|
||||
|
||||
def calc_effective_index_diff(obj_index: int, sub_index: int) -> int:
|
||||
"""
|
||||
计算有效的index差值
|
||||
有效差值 = 绝对差值 - 区间内其他客体的数量
|
||||
即:如果obj_index和sub_index之间的差值是由其他客体造成的,则应该扣除这部分差值
|
||||
"""
|
||||
if obj_index == sub_index:
|
||||
return 0
|
||||
|
||||
start, end = min(obj_index, sub_index), max(obj_index, sub_index)
|
||||
abs_diff = end - start
|
||||
|
||||
# 计算区间(start, end)内有多少个其他客体的index
|
||||
other_objects_count = 0
|
||||
for idx in range(start + 1, end):
|
||||
if idx in object_indices:
|
||||
other_objects_count += 1
|
||||
|
||||
return abs_diff - other_objects_count
|
||||
|
||||
# 为每个客体找到最匹配的主体
|
||||
for obj in objects:
|
||||
if len(subjects) == 0:
|
||||
@@ -217,10 +246,10 @@ def tie_up_category_by_index(
|
||||
min_index_diff = float("inf")
|
||||
best_subject_indices = []
|
||||
|
||||
# 找出index差值最小的所有主体
|
||||
# 找出有效index差值最小的所有主体
|
||||
for i, subject in enumerate(subjects):
|
||||
sub_index = subject["index"]
|
||||
index_diff = abs(obj_index - sub_index)
|
||||
index_diff = calc_effective_index_diff(obj_index, sub_index)
|
||||
|
||||
if index_diff < min_index_diff:
|
||||
min_index_diff = index_diff
|
||||
@@ -228,18 +257,37 @@ def tie_up_category_by_index(
|
||||
elif index_diff == min_index_diff:
|
||||
best_subject_indices.append(i)
|
||||
|
||||
# 如果有多个主体的index差值相同,使用中心点距离作为tiebreaker
|
||||
if len(best_subject_indices) > 1:
|
||||
min_center_dist = float("inf")
|
||||
if len(best_subject_indices) == 1:
|
||||
best_subject_idx = best_subject_indices[0]
|
||||
# 如果有多个主体的index差值相同(最多两个),根据边缘距离进行筛选
|
||||
elif len(best_subject_indices) == 2:
|
||||
# 计算所有候选主体的边缘距离
|
||||
edge_distances = [(idx, bbox_distance(obj["bbox"], subjects[idx]["bbox"])) for idx in best_subject_indices]
|
||||
edge_dist_diff = abs(edge_distances[0][1] - edge_distances[1][1])
|
||||
|
||||
for idx in best_subject_indices:
|
||||
center_dist = bbox_center_distance(obj["bbox"], subjects[idx]["bbox"])
|
||||
if center_dist < min_center_dist:
|
||||
min_center_dist = center_dist
|
||||
best_subject_idx = idx
|
||||
for idx, edge_dist in edge_distances:
|
||||
logger.debug(f"Obj index: {obj_index}, Sub index: {subjects[idx]['index']}, Edge distance: {edge_dist}")
|
||||
|
||||
if edge_dist_diff > 2:
|
||||
# 边缘距离差值大于2,匹配边缘距离更小的主体
|
||||
best_subject_idx = min(edge_distances, key=lambda x: x[1])[0]
|
||||
logger.debug(f"Obj index: {obj_index}, edge_dist_diff > 2, matching to subject with min edge distance, index: {subjects[best_subject_idx]['index']}")
|
||||
elif object_block_type == "table_caption":
|
||||
# 边缘距离差值<=2且为table_caption,匹配index更大的主体
|
||||
best_subject_idx = max(best_subject_indices, key=lambda idx: subjects[idx]["index"])
|
||||
logger.debug(f"Obj index: {obj_index}, edge_dist_diff <= 2 and table_caption, matching to later subject with index: {subjects[best_subject_idx]['index']}")
|
||||
elif object_block_type.endswith("footnote"):
|
||||
# 边缘距离差值<=2且为footnote,匹配index更小的主体
|
||||
best_subject_idx = min(best_subject_indices, key=lambda idx: subjects[idx]["index"])
|
||||
logger.debug(f"Obj index: {obj_index}, edge_dist_diff <= 2 and footnote, matching to earlier subject with index: {subjects[best_subject_idx]['index']}")
|
||||
else:
|
||||
# 边缘距离差值<=2 且不适用特殊匹配规则,使用中心点距离匹配
|
||||
center_distances = [(idx, bbox_center_distance(obj["bbox"], subjects[idx]["bbox"])) for idx in best_subject_indices]
|
||||
for idx, center_dist in center_distances:
|
||||
logger.debug(f"Obj index: {obj_index}, Sub index: {subjects[idx]['index']}, Center distance: {center_dist}")
|
||||
best_subject_idx = min(center_distances, key=lambda x: x[1])[0]
|
||||
else:
|
||||
best_subject_idx = best_subject_indices[0]
|
||||
raise ValueError("More than two subjects have the same minimal index difference, which is unexpected.")
|
||||
|
||||
# 将客体添加到最佳主体的obj_bboxes中
|
||||
result_dict[best_subject_idx]["obj_bboxes"].append(extract_object_func(obj))
|
||||
|
||||
@@ -1 +1 @@
|
||||
__version__ = "2.7.1"
|
||||
__version__ = "2.7.3"
|
||||
|
||||
@@ -109,7 +109,7 @@ repository = "https://github.com/opendatalab/MinerU"
|
||||
issues = "https://github.com/opendatalab/MinerU/issues"
|
||||
|
||||
[project.scripts]
|
||||
mineru = "mineru.cli:client.main"
|
||||
mineru = "mineru.cli.client:main"
|
||||
mineru-vllm-server = "mineru.cli.vlm_server:vllm_server"
|
||||
mineru-lmdeploy-server = "mineru.cli.vlm_server:lmdeploy_server"
|
||||
mineru-openai-server = "mineru.cli.vlm_server:openai_server"
|
||||
|
||||
Reference in New Issue
Block a user