Compare commits

...

35 Commits

Author SHA1 Message Date
Xiaomeng Zhao
6f571bb70e Merge pull request #1839 from opendatalab/release-1.2.2
Release 1.2.2
2025-03-04 20:48:22 +08:00
Xiaomeng Zhao
380cb4d9ea Merge pull request #1838 from myhloli/dev
refactor(magic_pdf): improve paragraph splitting logic and update dep…
2025-03-04 19:09:55 +08:00
myhloli
842483ccb3 refactor(magic_pdf): improve paragraph splitting logic and update dependencies
- Optimize paragraph splitting algorithm for better text block separation
- Update fast-langdetect dependency to ensure compatibility
2025-03-04 19:08:18 +08:00
Xiaomeng Zhao
7d99c1f614 Merge pull request #1829 from opendatalab/master
master -> dev
2025-03-03 18:24:30 +08:00
Xiaomeng Zhao
5db81bf07c Update python-package.yml 2025-03-03 18:23:36 +08:00
myhloli
da0c2eaa36 Update version.py with new version 2025-03-03 10:00:34 +00:00
Xiaomeng Zhao
9f6b5361a4 Merge pull request #1825 from opendatalab/release-1.2.1
Release 1.2.1
2025-03-03 17:59:07 +08:00
Xiaomeng Zhao
843742241a Merge pull request #1828 from opendatalab/dev
docs(readme): update changelog for v1.2.1 release
2025-03-03 17:58:52 +08:00
Xiaomeng Zhao
a7b5b8b882 Merge pull request #1827 from myhloli/dev
fix(readme): update changelog for v1.2.1 release
2025-03-03 17:58:08 +08:00
myhloli
f6f73050ad fix(readme): update changelog for v1.2.1 release
- Update README.md and README_zh-CN.md with the latest changes
- Add details about bug fixes in version 1.2.1
- Include improvements for full-width to half-width conversion, caption matching, and formula span issues
2025-03-03 17:57:15 +08:00
Xiaomeng Zhao
d327e1e996 Merge pull request #1824 from myhloli/dev
perf(inference): adjust batch ratio for high GPU memory
2025-03-03 17:31:43 +08:00
myhloli
0b05dff74f perf(inference): adjust batch ratio for high GPU memory
- Increase batch ratio to 8 for GPU memory >=16GB
- Improve inference performance on systems with higher GPU memory
2025-03-03 17:30:02 +08:00
Xiaomeng Zhao
f20ab37dbd Merge pull request #1822 from icecraft/fix/caption_match
fix: caption match
2025-03-03 17:17:49 +08:00
Xiaomeng Zhao
98adcbb9c8 Merge pull request #1823 from myhloli/dev
refactor(pre_proc): allow interline equations to be associated with text blocks
2025-03-03 17:16:27 +08:00
myhloli
083b787c15 refactor(pre_proc): allow interline equations to be associated with text blocks
- Update OCR dictionary merge logic to include text blocks when processing interline equations
- This change improves the handling of equations that may be embedded within text content
2025-03-03 17:14:13 +08:00
icecraft
fb02be19c9 fix: caption match 2025-03-03 17:08:17 +08:00
Xiaomeng Zhao
a01bd7eda7 Merge pull request #1821 from myhloli/dev
perf(mfr): improve Math Formula Recognition by sorting images by area
2025-03-03 16:04:52 +08:00
myhloli
58b6ad8ca2 perf(inference): adjust batch ratio for GPU memory sizes
- Simplify batch ratio logic for GPU memory >= 16GB
- Remove unnecessary conditions for 20GB and 40GB memory
2025-03-03 15:43:50 +08:00
myhloli
0d3304d7c9 perf(inference): adjust batch ratio for GPU memory sizes
- Simplify batch ratio logic for GPU memory >= 16GB
- Remove unnecessary conditions for 20GB and 40GB memory
2025-03-03 15:36:52 +08:00
myhloli
59fc80d473 perf(mfr): improve Math Formula Recognition by sorting images by area
- Sort detected images by area before processing to enhance MFR accuracy
- Implement stable sorting to maintain original order of images with equal area
2025-03-03 15:26:12 +08:00
myhloli
6bfc17119d refactor(pdf_parse): comment out performance measurement and logging
- Comment out @measure_time decorator for txt_spans_extract_v2 and sort_lines_by_model functions
- Remove logger.info for page_process_time
- Comment out PerformanceStats.print_stats call
2025-03-03 15:04:18 +08:00
myhloli
e516cf535c feat(performance): add performance monitoring and optimization
- Add performance_stats module to measure and print execution time statistics
- Implement measure_time decorator to track execution time of key functions
- Remove multi-threading in pdf parsing for better resource management
- Optimize pdf parsing logic for improved performance
2025-03-03 14:52:48 +08:00
myhloli
6ec440d6f1 feat(pdf_parse): implement multi-threaded page processing
- Add ThreadPoolExecutor to process PDF pages in parallel
- Create separate function for page processing to improve readability and maintainability
- Include error handling for individual page processing tasks
- Log total page processing time for performance monitoring
2025-02-28 19:02:05 +08:00
Xiaomeng Zhao
058c349c24 Merge pull request #1799 from myhloli/dev
refactor(ocr_mkcontent): optimize full-width character handling
2025-02-27 18:08:49 +08:00
myhloli
df1b8f598f refactor(ocr_mkcontent): optimize full-width character handling
- Update condition to only convert full-width letters and numbers
- Remove separate case for full-width space
2025-02-27 17:26:37 +08:00
myhloli
d64182ea82 Update version.py with new version 2025-02-27 03:01:07 +00:00
Xiaomeng Zhao
21451be2dc Merge pull request #1793 from opendatalab/release-1.2.0
Release 1.2.0
2025-02-27 10:55:37 +08:00
Xiaomeng Zhao
98b215b8a9 Merge pull request #1789 from opendatalab/release-1.2.0
Release 1.2.0
2025-02-26 18:20:16 +08:00
github-actions[bot]
82d4e451fd @luckymore has signed the CLA in opendatalab/MinerU#1785 2025-02-26 09:43:28 +00:00
Xiaomeng Zhao
e0b74b8664 Merge pull request #1778 from opendatalab/release-1.2.0
Release 1.2.0
2025-02-25 18:19:09 +08:00
Xiaomeng Zhao
6e0d9a3e70 Merge pull request #1771 from opendatalab/release-1.2.0
Release 1.2.0
2025-02-25 15:05:33 +08:00
github-actions[bot]
815551fe06 @BetterAndBetterII has signed the CLA in opendatalab/MinerU#1767 2025-02-25 05:21:19 +00:00
Xiaomeng Zhao
7a88003a95 Merge pull request #1762 from opendatalab/release-1.2.0
update 1.2.0 release note
2025-02-24 18:36:52 +08:00
Xiaomeng Zhao
1f49712974 Merge pull request #1759 from opendatalab/release-1.2.0
Release 1.2.0
2025-02-24 17:36:00 +08:00
github-actions[bot]
9a87d3eaad @nadahlberg has signed the CLA in opendatalab/MinerU#1748 2025-02-22 17:04:26 +00:00
14 changed files with 191 additions and 41 deletions

View File

@@ -122,6 +122,6 @@ jobs:
- name: Publish distribution to PyPI
run: |
pip install twine
pip install -U twine id keyring packaging readme-renderer requests requests-toolbelt rfc3986 rich urllib3
twine check dist/*
twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}

View File

@@ -47,6 +47,10 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
</div>
# Changelog
- 2025/03/03 1.2.1 released, fixed several bugs:
- Fixed the impact on punctuation marks during full-width to half-width conversion of letters and numbers
- Fixed caption matching inaccuracies in certain scenarios
- Fixed formula span loss issues in certain scenarios
- 2025/02/24 1.2.0 released. This version includes several fixes and improvements to enhance parsing efficiency and accuracy:
- Performance Optimization
- Increased classification speed for PDF documents in auto mode.

View File

@@ -46,6 +46,10 @@
</div>
# 更新记录
- 2025/03/03 1.2.1 发布,修复了一些问题:
- 修复在字母与数字的全角转半角操作时对标点符号的影响
- 修复在某些情况下caption的匹配不准确问题
- 修复在某些情况下的公式span丢失问题
- 2025/02/24 1.2.0 发布,这个版本我们修复了一些问题,提升了解析的效率与精度:
- 性能优化
- auto模式下pdf文档的分类速度提升

View File

@@ -138,12 +138,9 @@ def full_to_half(text: str) -> str:
result = []
for char in text:
code = ord(char)
# Full-width ASCII variants (FF01-FF5E)
if 0xFF01 <= code <= 0xFF5E:
# Full-width letters and numbers (FF21-FF3A for A-Z, FF41-FF5A for a-z, FF10-FF19 for 0-9)
if (0xFF21 <= code <= 0xFF3A) or (0xFF41 <= code <= 0xFF5A) or (0xFF10 <= code <= 0xFF19):
result.append(chr(code - 0xFEE0)) # Shift to ASCII range
# Full-width space
elif code == 0x3000:
result.append(' ')
else:
result.append(char)
return ''.join(result)

View File

@@ -0,0 +1,54 @@
import time
import functools
from collections import defaultdict
from typing import Dict, List
class PerformanceStats:
    """Performance statistics collector: records and reports function execution times.

    Times are accumulated per function name in a class-level registry, so all
    instrumented functions across the process share one set of statistics.
    """

    # Maps function name -> list of recorded execution times (seconds).
    _stats: Dict[str, List[float]] = defaultdict(list)

    @classmethod
    def add_execution_time(cls, func_name: str, execution_time: float):
        """Record one execution time (in seconds) for *func_name*."""
        cls._stats[func_name].append(execution_time)

    @classmethod
    def get_stats(cls) -> Dict[str, dict]:
        """Return per-function summary statistics.

        Each entry maps a function name to a dict with 'count', 'total_time',
        'avg_time', 'min_time' and 'max_time' (all times in seconds). The
        per-function time lists are only ever appended to, so they are never
        empty here and the average cannot divide by zero.
        """
        return {
            func_name: {
                'count': len(times),
                'total_time': sum(times),
                'avg_time': sum(times) / len(times),
                'min_time': min(times),
                'max_time': max(times),
            }
            for func_name, times in cls._stats.items()
        }

    @classmethod
    def print_stats(cls):
        """Print a formatted table of the collected statistics to stdout."""
        stats = cls.get_stats()
        print("\n性能统计结果:")
        print("-" * 80)
        print(f"{'方法名':<40} {'调用次数':>8} {'总时间(s)':>12} {'平均时间(s)':>12}")
        print("-" * 80)
        for func_name, data in stats.items():
            print(f"{func_name:<40} {data['count']:8d} {data['total_time']:12.6f} {data['avg_time']:12.6f}")


def measure_time(func):
    """Decorator that records the wrapped function's wall-clock execution time.

    The elapsed time is reported to ``PerformanceStats`` under the function's
    ``__name__``. Timing is recorded in a ``finally`` block, so calls that
    raise an exception are still counted in the statistics (the original
    implementation silently dropped the timing of failing calls).
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        try:
            return func(*args, **kwargs)
        finally:
            # Record even when func raises; the exception still propagates.
            PerformanceStats.add_execution_time(func.__name__, time.time() - start_time)
    return wrapper

View File

@@ -1 +1 @@
__version__ = "1.1.0"
__version__ = "1.2.1"

View File

@@ -170,11 +170,7 @@ def doc_analyze(
gpu_memory = int(os.getenv("VIRTUAL_VRAM_SIZE", round(get_vram(device))))
if gpu_memory is not None and gpu_memory >= 8:
if gpu_memory >= 40:
batch_ratio = 32
elif gpu_memory >=20:
batch_ratio = 16
elif gpu_memory >= 16:
if gpu_memory >= 16:
batch_ratio = 8
elif gpu_memory >= 10:
batch_ratio = 4

View File

@@ -528,14 +528,13 @@ class MagicModel:
pair_dis = bbox_distance(subjects[sub_idx]['bbox'], objects[obj_idx]['bbox'])
nearest_dis = float('inf')
for i in range(N):
if i in seen_idx:continue
if i in seen_idx or i == sub_idx:continue
nearest_dis = min(nearest_dis, bbox_distance(subjects[i]['bbox'], objects[obj_idx]['bbox']))
if pair_dis >= 3*nearest_dis:
seen_idx.add(sub_idx)
continue
seen_idx.add(sub_idx)
seen_idx.add(obj_idx + OBJ_IDX_OFFSET)
seen_sub_idx.add(sub_idx)

View File

@@ -100,20 +100,61 @@ class UnimernetModel(object):
res["latex"] = latex_rm_whitespace(latex)
return formula_list
def batch_predict(
self, images_mfd_res: list, images: list, batch_size: int = 64
) -> list:
# def batch_predict(
# self, images_mfd_res: list, images: list, batch_size: int = 64
# ) -> list:
# images_formula_list = []
# mf_image_list = []
# backfill_list = []
# for image_index in range(len(images_mfd_res)):
# mfd_res = images_mfd_res[image_index]
# pil_img = Image.fromarray(images[image_index])
# formula_list = []
#
# for xyxy, conf, cla in zip(
# mfd_res.boxes.xyxy, mfd_res.boxes.conf, mfd_res.boxes.cls
# ):
# xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
# new_item = {
# "category_id": 13 + int(cla.item()),
# "poly": [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
# "score": round(float(conf.item()), 2),
# "latex": "",
# }
# formula_list.append(new_item)
# bbox_img = pil_img.crop((xmin, ymin, xmax, ymax))
# mf_image_list.append(bbox_img)
#
# images_formula_list.append(formula_list)
# backfill_list += formula_list
#
# dataset = MathDataset(mf_image_list, transform=self.mfr_transform)
# dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=0)
# mfr_res = []
# for mf_img in dataloader:
# mf_img = mf_img.to(self.device)
# with torch.no_grad():
# output = self.model.generate({"image": mf_img})
# mfr_res.extend(output["pred_str"])
# for res, latex in zip(backfill_list, mfr_res):
# res["latex"] = latex_rm_whitespace(latex)
# return images_formula_list
def batch_predict(self, images_mfd_res: list, images: list, batch_size: int = 64) -> list:
images_formula_list = []
mf_image_list = []
backfill_list = []
image_info = [] # Store (area, original_index, image) tuples
# Collect images with their original indices
for image_index in range(len(images_mfd_res)):
mfd_res = images_mfd_res[image_index]
pil_img = Image.fromarray(images[image_index])
formula_list = []
for xyxy, conf, cla in zip(
mfd_res.boxes.xyxy, mfd_res.boxes.conf, mfd_res.boxes.cls
):
for idx, (xyxy, conf, cla) in enumerate(zip(
mfd_res.boxes.xyxy, mfd_res.boxes.conf, mfd_res.boxes.cls
)):
xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
new_item = {
"category_id": 13 + int(cla.item()),
@@ -123,19 +164,43 @@ class UnimernetModel(object):
}
formula_list.append(new_item)
bbox_img = pil_img.crop((xmin, ymin, xmax, ymax))
area = (xmax - xmin) * (ymax - ymin)
curr_idx = len(mf_image_list)
image_info.append((area, curr_idx, bbox_img))
mf_image_list.append(bbox_img)
images_formula_list.append(formula_list)
backfill_list += formula_list
dataset = MathDataset(mf_image_list, transform=self.mfr_transform)
# Stable sort by area
image_info.sort(key=lambda x: x[0]) # sort by area
sorted_indices = [x[1] for x in image_info]
sorted_images = [x[2] for x in image_info]
# Create mapping for results
index_mapping = {new_idx: old_idx for new_idx, old_idx in enumerate(sorted_indices)}
# Create dataset with sorted images
dataset = MathDataset(sorted_images, transform=self.mfr_transform)
dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=0)
# Process batches and store results
mfr_res = []
for mf_img in dataloader:
mf_img = mf_img.to(self.device)
with torch.no_grad():
output = self.model.generate({"image": mf_img})
mfr_res.extend(output["pred_str"])
for res, latex in zip(backfill_list, mfr_res):
res["latex"] = latex_rm_whitespace(latex)
# Restore original order
unsorted_results = [""] * len(mfr_res)
for new_idx, latex in enumerate(mfr_res):
original_idx = index_mapping[new_idx]
unsorted_results[original_idx] = latex_rm_whitespace(latex)
# Fill results back
for res, latex in zip(backfill_list, unsorted_results):
res["latex"] = latex
return images_formula_list

View File

@@ -21,9 +21,12 @@ from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir, get_l
from magic_pdf.libs.convert_utils import dict_to_list
from magic_pdf.libs.hash_utils import compute_md5
from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
from magic_pdf.libs.performance_stats import measure_time, PerformanceStats
from magic_pdf.model.magic_model import MagicModel
from magic_pdf.post_proc.llm_aided import llm_aided_formula, llm_aided_text, llm_aided_title
from concurrent.futures import ThreadPoolExecutor
try:
import torchtext
@@ -215,7 +218,7 @@ def calculate_contrast(img, img_mode) -> float:
# logger.info(f"contrast: {contrast}")
return round(contrast, 2)
# @measure_time
def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
# cid用0xfffd表示连字符拆开
# text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
@@ -489,7 +492,7 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
else:
return [[x0, y0, x1, y1]]
# @measure_time
def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
page_line_list = []
@@ -923,7 +926,6 @@ def pdf_parse_union(
magic_model = MagicModel(model_list, dataset)
"""根据输入的起始范围解析pdf"""
# end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
end_page_id = (
end_page_id
if end_page_id is not None and end_page_id >= 0
@@ -960,6 +962,8 @@ def pdf_parse_union(
)
pdf_info_dict[f'page_{page_id}'] = page_info
# PerformanceStats.print_stats()
"""分段"""
para_split(pdf_info_dict)

View File

@@ -108,6 +108,22 @@ def __is_list_or_index_block(block):
):
multiple_para_flag = True
block_text = ''
for line in block['lines']:
line_text = ''
for span in line['spans']:
span_type = span['type']
if span_type == ContentType.Text:
line_text += span['content'].strip()
# 添加所有文本包括空行保持与block['lines']长度一致
lines_text_list.append(line_text)
block_text = ''.join(lines_text_list)
block_lang = detect_lang(block_text)
# logger.info(f"block_lang: {block_lang}")
for line in block['lines']:
line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
@@ -119,19 +135,6 @@ def __is_list_or_index_block(block):
if abs(line_mid_x - block_mid_x) < line_height / 2:
center_close_num += 1
line_text = ''
for span in line['spans']:
span_type = span['type']
if span_type == ContentType.Text:
line_text += span['content'].strip()
# 添加所有文本包括空行保持与block['lines']长度一致
lines_text_list.append(line_text)
block_text = ''.join(lines_text_list)
block_lang = detect_lang(block_text)
# logger.info(f"block_lang: {block_lang}")
# 计算line左侧顶格数量是否大于2是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断
if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2:
left_close_num += 1

View File

@@ -64,7 +64,7 @@ def span_block_type_compatible(span_type, block_type):
if span_type in [ContentType.Text, ContentType.InlineEquation]:
return block_type in [BlockType.Text, BlockType.Title, BlockType.ImageCaption, BlockType.ImageFootnote, BlockType.TableCaption, BlockType.TableFootnote]
elif span_type == ContentType.InterlineEquation:
return block_type in [BlockType.InterlineEquation]
return block_type in [BlockType.InterlineEquation, BlockType.Text]
elif span_type == ContentType.Image:
return block_type in [BlockType.ImageBody]
elif span_type == ContentType.Table:

View File

@@ -1,7 +1,7 @@
boto3>=1.28.43
Brotli>=1.1.0
click>=8.1.7
fast-langdetect>=0.2.3
fast-langdetect>=0.2.3,<0.3.0
loguru>=0.6.0
numpy>=1.21.6,<2.0.0
pydantic>=2.7.2

View File

@@ -159,6 +159,30 @@
"created_at": "2025-02-22T07:15:35Z",
"repoId": 765083837,
"pullRequestNo": 1743
},
{
"name": "nadahlberg",
"id": 58701810,
"comment_id": 2676309097,
"created_at": "2025-02-22T17:04:14Z",
"repoId": 765083837,
"pullRequestNo": 1748
},
{
"name": "BetterAndBetterII",
"id": 141388234,
"comment_id": 2680567709,
"created_at": "2025-02-25T05:21:05Z",
"repoId": 765083837,
"pullRequestNo": 1767
},
{
"name": "luckymore",
"id": 5390013,
"comment_id": 2684392503,
"created_at": "2025-02-26T09:23:25Z",
"repoId": 765083837,
"pullRequestNo": 1785
}
]
}