Compare commits

...

21 Commits

Author SHA1 Message Date
Xiaomeng Zhao
380cb4d9ea Merge pull request #1838 from myhloli/dev
refactor(magic_pdf): improve paragraph splitting logic and update dependencies
2025-03-04 19:09:55 +08:00
myhloli
842483ccb3 refactor(magic_pdf): improve paragraph splitting logic and update dependencies
- Optimize paragraph splitting algorithm for better text block separation
- Update fast-langdetect dependency to ensure compatibility
2025-03-04 19:08:18 +08:00
Xiaomeng Zhao
7d99c1f614 Merge pull request #1829 from opendatalab/master
master -> dev
2025-03-03 18:24:30 +08:00
Xiaomeng Zhao
5db81bf07c Update python-package.yml 2025-03-03 18:23:36 +08:00
myhloli
da0c2eaa36 Update version.py with new version 2025-03-03 10:00:34 +00:00
Xiaomeng Zhao
9f6b5361a4 Merge pull request #1825 from opendatalab/release-1.2.1
Release 1.2.1
2025-03-03 17:59:07 +08:00
myhloli
d64182ea82 Update version.py with new version 2025-02-27 03:01:07 +00:00
Xiaomeng Zhao
21451be2dc Merge pull request #1793 from opendatalab/release-1.2.0
Release 1.2.0
2025-02-27 10:55:37 +08:00
Xiaomeng Zhao
97633eb1e2 Merge pull request #1792 from opendatalab/dev
fix: match multiple captions
2025-02-27 10:48:42 +08:00
Xiaomeng Zhao
98b215b8a9 Merge pull request #1789 from opendatalab/release-1.2.0
Release 1.2.0
2025-02-26 18:20:16 +08:00
Xiaomeng Zhao
b6d60bb0fa Merge pull request #1788 from opendatalab/dev
refactor(magic_pdf): remove bfloat16 support checks and usage
2025-02-26 18:19:15 +08:00
github-actions[bot]
82d4e451fd @luckymore has signed the CLA in opendatalab/MinerU#1785 2025-02-26 09:43:28 +00:00
Xiaomeng Zhao
e0b74b8664 Merge pull request #1778 from opendatalab/release-1.2.0
Release 1.2.0
2025-02-25 18:19:09 +08:00
Xiaomeng Zhao
d3c822f827 Merge pull request #1777 from opendatalab/dev
perf(model): optimize batch analyze process
2025-02-25 18:18:41 +08:00
Xiaomeng Zhao
6e0d9a3e70 Merge pull request #1771 from opendatalab/release-1.2.0
Release 1.2.0
2025-02-25 15:05:33 +08:00
Xiaomeng Zhao
dd7af4d4d9 Merge pull request #1770 from opendatalab/dev
docs(ascend): update Ascend NPU acceleration documentation
2025-02-25 15:04:58 +08:00
github-actions[bot]
815551fe06 @BetterAndBetterII has signed the CLA in opendatalab/MinerU#1767 2025-02-25 05:21:19 +00:00
Xiaomeng Zhao
7a88003a95 Merge pull request #1762 from opendatalab/release-1.2.0
update 1.2.0 release note
2025-02-24 18:36:52 +08:00
Xiaomeng Zhao
0b389e5022 Merge pull request #1761 from opendatalab/dev
docs(README): update release notes for version 1.2.0
2025-02-24 18:35:13 +08:00
Xiaomeng Zhao
1f49712974 Merge pull request #1759 from opendatalab/release-1.2.0
Release 1.2.0
2025-02-24 17:36:00 +08:00
github-actions[bot]
9a87d3eaad @nadahlberg has signed the CLA in opendatalab/MinerU#1748 2025-02-22 17:04:26 +00:00
5 changed files with 43 additions and 16 deletions

View File

@@ -122,6 +122,6 @@ jobs:
- name: Publish distribution to PyPI
run: |
-pip install twine
+pip install -U twine id keyring packaging readme-renderer requests requests-toolbelt rfc3986 rich urllib3
twine check dist/*
twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}

View File

@@ -1 +1 @@
-__version__ = "1.1.0"
+__version__ = "1.2.1"

View File

@@ -108,6 +108,22 @@ def __is_list_or_index_block(block):
):
multiple_para_flag = True
block_text = ''
for line in block['lines']:
line_text = ''
for span in line['spans']:
span_type = span['type']
if span_type == ContentType.Text:
line_text += span['content'].strip()
# 添加所有文本包括空行保持与block['lines']长度一致
lines_text_list.append(line_text)
block_text = ''.join(lines_text_list)
block_lang = detect_lang(block_text)
# logger.info(f"block_lang: {block_lang}")
for line in block['lines']:
line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
@@ -119,19 +135,6 @@ def __is_list_or_index_block(block):
if abs(line_mid_x - block_mid_x) < line_height / 2:
center_close_num += 1
line_text = ''
for span in line['spans']:
span_type = span['type']
if span_type == ContentType.Text:
line_text += span['content'].strip()
# 添加所有文本包括空行保持与block['lines']长度一致
lines_text_list.append(line_text)
block_text = ''.join(lines_text_list)
block_lang = detect_lang(block_text)
# logger.info(f"block_lang: {block_lang}")
# 计算line左侧顶格数量是否大于2是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断
if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2:
left_close_num += 1

View File

@@ -1,7 +1,7 @@
boto3>=1.28.43
Brotli>=1.1.0
click>=8.1.7
-fast-langdetect>=0.2.3
+fast-langdetect>=0.2.3,<0.3.0
loguru>=0.6.0
numpy>=1.21.6,<2.0.0
pydantic>=2.7.2

View File

@@ -159,6 +159,30 @@
"created_at": "2025-02-22T07:15:35Z",
"repoId": 765083837,
"pullRequestNo": 1743
},
{
"name": "nadahlberg",
"id": 58701810,
"comment_id": 2676309097,
"created_at": "2025-02-22T17:04:14Z",
"repoId": 765083837,
"pullRequestNo": 1748
},
{
"name": "BetterAndBetterII",
"id": 141388234,
"comment_id": 2680567709,
"created_at": "2025-02-25T05:21:05Z",
"repoId": 765083837,
"pullRequestNo": 1767
},
{
"name": "luckymore",
"id": 5390013,
"comment_id": 2684392503,
"created_at": "2025-02-26T09:23:25Z",
"repoId": 765083837,
"pullRequestNo": 1785
}
]
}