mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
Compare commits
21 Commits
release-1.
...
release-1.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
380cb4d9ea | ||
|
|
842483ccb3 | ||
|
|
7d99c1f614 | ||
|
|
5db81bf07c | ||
|
|
da0c2eaa36 | ||
|
|
9f6b5361a4 | ||
|
|
d64182ea82 | ||
|
|
21451be2dc | ||
|
|
97633eb1e2 | ||
|
|
98b215b8a9 | ||
|
|
b6d60bb0fa | ||
|
|
82d4e451fd | ||
|
|
e0b74b8664 | ||
|
|
d3c822f827 | ||
|
|
6e0d9a3e70 | ||
|
|
dd7af4d4d9 | ||
|
|
815551fe06 | ||
|
|
7a88003a95 | ||
|
|
0b389e5022 | ||
|
|
1f49712974 | ||
|
|
9a87d3eaad |
2
.github/workflows/python-package.yml
vendored
2
.github/workflows/python-package.yml
vendored
@@ -122,6 +122,6 @@ jobs:
|
||||
|
||||
- name: Publish distribution to PyPI
|
||||
run: |
|
||||
pip install twine
|
||||
pip install -U twine id keyring packaging readme-renderer requests requests-toolbelt rfc3986 rich urllib3
|
||||
twine check dist/*
|
||||
twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
|
||||
|
||||
@@ -1 +1 @@
|
||||
__version__ = "1.1.0"
|
||||
__version__ = "1.2.1"
|
||||
|
||||
@@ -108,6 +108,22 @@ def __is_list_or_index_block(block):
|
||||
):
|
||||
multiple_para_flag = True
|
||||
|
||||
block_text = ''
|
||||
|
||||
for line in block['lines']:
|
||||
line_text = ''
|
||||
|
||||
for span in line['spans']:
|
||||
span_type = span['type']
|
||||
if span_type == ContentType.Text:
|
||||
line_text += span['content'].strip()
|
||||
# 添加所有文本,包括空行,保持与block['lines']长度一致
|
||||
lines_text_list.append(line_text)
|
||||
block_text = ''.join(lines_text_list)
|
||||
|
||||
block_lang = detect_lang(block_text)
|
||||
# logger.info(f"block_lang: {block_lang}")
|
||||
|
||||
for line in block['lines']:
|
||||
line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
|
||||
block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
|
||||
@@ -119,19 +135,6 @@ def __is_list_or_index_block(block):
|
||||
if abs(line_mid_x - block_mid_x) < line_height / 2:
|
||||
center_close_num += 1
|
||||
|
||||
line_text = ''
|
||||
|
||||
for span in line['spans']:
|
||||
span_type = span['type']
|
||||
if span_type == ContentType.Text:
|
||||
line_text += span['content'].strip()
|
||||
|
||||
# 添加所有文本,包括空行,保持与block['lines']长度一致
|
||||
lines_text_list.append(line_text)
|
||||
block_text = ''.join(lines_text_list)
|
||||
block_lang = detect_lang(block_text)
|
||||
# logger.info(f"block_lang: {block_lang}")
|
||||
|
||||
# 计算line左侧顶格数量是否大于2,是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断
|
||||
if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2:
|
||||
left_close_num += 1
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
boto3>=1.28.43
|
||||
Brotli>=1.1.0
|
||||
click>=8.1.7
|
||||
fast-langdetect>=0.2.3
|
||||
fast-langdetect>=0.2.3,<0.3.0
|
||||
loguru>=0.6.0
|
||||
numpy>=1.21.6,<2.0.0
|
||||
pydantic>=2.7.2
|
||||
|
||||
@@ -159,6 +159,30 @@
|
||||
"created_at": "2025-02-22T07:15:35Z",
|
||||
"repoId": 765083837,
|
||||
"pullRequestNo": 1743
|
||||
},
|
||||
{
|
||||
"name": "nadahlberg",
|
||||
"id": 58701810,
|
||||
"comment_id": 2676309097,
|
||||
"created_at": "2025-02-22T17:04:14Z",
|
||||
"repoId": 765083837,
|
||||
"pullRequestNo": 1748
|
||||
},
|
||||
{
|
||||
"name": "BetterAndBetterII",
|
||||
"id": 141388234,
|
||||
"comment_id": 2680567709,
|
||||
"created_at": "2025-02-25T05:21:05Z",
|
||||
"repoId": 765083837,
|
||||
"pullRequestNo": 1767
|
||||
},
|
||||
{
|
||||
"name": "luckymore",
|
||||
"id": 5390013,
|
||||
"comment_id": 2684392503,
|
||||
"created_at": "2025-02-26T09:23:25Z",
|
||||
"repoId": 765083837,
|
||||
"pullRequestNo": 1785
|
||||
}
|
||||
]
|
||||
}
|
||||
Reference in New Issue
Block a user