mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
Compare commits
16 Commits
magic_pdf-
...
magic_pdf-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
07f6c49707 | ||
|
|
1de37e4c65 | ||
|
|
bd1834284e | ||
|
|
496045f361 | ||
|
|
75478eda89 | ||
|
|
3f3edc39f5 | ||
|
|
97a4e47319 | ||
|
|
5de372245c | ||
|
|
135adac43d | ||
|
|
ba52e33527 | ||
|
|
78ed786794 | ||
|
|
4ff09a2fbc | ||
|
|
f8548a8ea2 | ||
|
|
10a95bcd05 | ||
|
|
dbdbaf58be | ||
|
|
afe92f07d6 |
29
.github/workflows/python-package.yml
vendored
29
.github/workflows/python-package.yml
vendored
@@ -11,6 +11,35 @@ on:
|
||||
|
||||
|
||||
jobs:
|
||||
|
||||
update-version:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
|
||||
- name: Update version.py
|
||||
run: |
|
||||
python update_version.py
|
||||
|
||||
- name: Commit changes
  run: |
    git config --local user.email "moe@myhloli.com"
    git config --local user.name "myhloli"
    # update_version.py writes magic_pdf/libs/version.py; there is no
    # root-level version.py to stage.
    git add magic_pdf/libs/version.py
    # Commit only when the staged version file actually changed: a bare
    # `git commit` exits non-zero when nothing is staged and fails the job.
    git diff --cached --quiet || git commit -m "Update version.py with new version"
|
||||
|
||||
- name: Push changes
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }}
|
||||
run: |
|
||||
git push
|
||||
build:
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import datetime
|
||||
import json
|
||||
import os, re, configparser
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
import boto3
|
||||
@@ -11,6 +12,7 @@ from botocore.config import Config
|
||||
import fitz # 1.23.9中已经切换到rebase
|
||||
# import fitz_old as fitz # 使用1.23.9之前的pymupdf库
|
||||
|
||||
|
||||
def get_delta_time(input_time):
    """Return the seconds elapsed since ``input_time``, rounded to 2 decimals.

    ``input_time`` is subtracted from the current ``time.time()`` value.
    """
    elapsed = time.time() - input_time
    return round(elapsed, 2)
|
||||
|
||||
|
||||
1
magic_pdf/libs/version.py
Normal file
1
magic_pdf/libs/version.py
Normal file
@@ -0,0 +1 @@
|
||||
__version__ = "0.0.0"
|
||||
@@ -1,7 +1,7 @@
|
||||
from sklearn.cluster import DBSCAN
|
||||
import numpy as np
|
||||
from loguru import logger
|
||||
|
||||
import re
|
||||
from magic_pdf.libs.boxbase import _is_in_or_part_overlap_with_area_ratio as is_in_layout
|
||||
from magic_pdf.libs.ocr_content_type import ContentType, BlockType
|
||||
from magic_pdf.model.magic_model import MagicModel
|
||||
@@ -106,16 +106,19 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
|
||||
3. 如果非顶格,首字符大写,编码为2
|
||||
4. 如果非顶格,首字符非大写编码为3
|
||||
"""
|
||||
if len(lines) > 0:
|
||||
x_map_tag_dict, min_x_tag = cluster_line_x(lines)
|
||||
for l in lines:
|
||||
first_char = __get_span_text(l['spans'][0])[0]
|
||||
span_text = __get_span_text(l['spans'][0])
|
||||
first_char = span_text[0]
|
||||
layout = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)
|
||||
if not layout:
|
||||
line_fea_encode.append(0)
|
||||
else:
|
||||
layout_left = layout[0]
|
||||
if l['bbox'][0] == layout_left:
|
||||
#
|
||||
if x_map_tag_dict[round(l['bbox'][0])] == min_x_tag:
|
||||
# if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
|
||||
if not first_char.isalnum():
|
||||
if not first_char.isalnum() or if_match_reference_list(span_text):
|
||||
line_fea_encode.append(1)
|
||||
else:
|
||||
line_fea_encode.append(4)
|
||||
@@ -144,6 +147,36 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
|
||||
|
||||
return split_indices(total_lines, list_indice), list_start_idx
|
||||
|
||||
def cluster_line_x(lines: list) -> tuple[dict, int]:
    """Cluster the left edges (x0) of all lines in one block.

    Each line's ``bbox[0]`` is rounded and clustered with DBSCAN (eps=5,
    min_samples=1); every member of a cluster is mapped to the smallest x0
    found in that cluster.

    Args:
        lines: non-empty list of line dicts, each carrying a ``'bbox'`` whose
            first element is the x0 coordinate.

    Returns:
        A ``(x0_2_new_val, min_x0)`` tuple: ``x0_2_new_val`` maps each rounded
        x0 to its cluster's minimum x0, and ``min_x0`` is the smallest cluster
        minimum. Both use plain Python ints so dict lookups and ``==``
        comparisons by callers behave uniformly.
    """
    min_distance = 5
    min_sample = 1
    # DBSCAN expects 2-D samples, so x0 is paired with a constant 0.
    x0_lst = np.array([[round(line['bbox'][0]), 0] for line in lines])
    labels = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x0_lst).labels_

    x0_2_new_val = {}  # rounded x0 -> minimum x0 of its cluster
    min_x0 = int(round(lines[0]["bbox"][0]))
    for label in np.unique(labels):
        if label == -1:
            # Noise label; kept as a safeguard if min_samples is ever raised.
            continue
        cluster_x0 = x0_lst[labels == label][:, 0]
        cluster_min = int(cluster_x0.min())
        x0_2_new_val.update({int(raw_val): cluster_min for raw_val in cluster_x0})
        min_x0 = min(min_x0, cluster_min)
    return x0_2_new_val, min_x0
|
||||
|
||||
def if_match_reference_list(text: str) -> bool:
    """Tell whether ``text`` begins with one or more digits followed by a dot.

    Such a prefix (e.g. ``"12. ..."``) is treated as the start of a numbered
    reference-list entry.
    """
    return re.match(r'^\d+\..*', text) is not None
|
||||
|
||||
|
||||
def __valign_lines(blocks, layout_bboxes):
|
||||
"""
|
||||
@@ -315,10 +348,11 @@ def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en"):
|
||||
"""
|
||||
for list_start in list_start_line:
|
||||
if len(list_start) > 1:
|
||||
for i in range(1, len(list_start)):
|
||||
for i in range(0, len(list_start)):
|
||||
index = list_start[i] - 1
|
||||
if "content" in lines[index]["spans"][-1]:
|
||||
lines[index]["spans"][-1]["content"] += '\n\n'
|
||||
if index >= 0:
|
||||
if "content" in lines[index]["spans"][-1]:
|
||||
lines[index]["spans"][-1]["content"] += '\n\n'
|
||||
layout_list_info = [False, False] # 这个layout最后是不是列表,记录每一个layout里是不是列表开头,列表结尾
|
||||
for content_type, start, end in text_segments:
|
||||
if content_type == 'list':
|
||||
@@ -388,20 +422,17 @@ def __connect_list_inter_layout(blocks_group, new_layout_bbox, layout_list_info,
|
||||
logger.info(f"连接page {page_num} 内的list")
|
||||
# 向layout_paras[i] 寻找开头具有相同缩进的连续的行
|
||||
may_list_lines = []
|
||||
for j in range(len(next_paras)):
|
||||
lines = next_paras[j].get("lines", [])
|
||||
if len(lines) == 1: # 只可能是一行,多行情况再需要分析了
|
||||
if lines[0]['bbox'][0] > __find_layout_bbox_by_line(lines[0]['bbox'], new_layout_bbox)[0]:
|
||||
may_list_lines.append(lines[0])
|
||||
else:
|
||||
break
|
||||
lines = next_first_para.get("lines", [])
|
||||
|
||||
for line in lines:
|
||||
if line['bbox'][0] > __find_layout_bbox_by_line(line['bbox'], new_layout_bbox)[0]:
|
||||
may_list_lines.append(line)
|
||||
else:
|
||||
break
|
||||
# 如果这些行的缩进是相等的,那么连到上一个layout的最后一个段落上。
|
||||
if len(may_list_lines) > 0 and len(set([x['bbox'][0] for x in may_list_lines])) == 1:
|
||||
pre_last_para.extend(may_list_lines)
|
||||
blocks_group[i] = blocks_group[i][len(may_list_lines):]
|
||||
# layout_paras[i] = layout_paras[i][len(may_list_lines):]
|
||||
next_first_para["lines"] = next_first_para["lines"][len(may_list_lines):]
|
||||
|
||||
return blocks_group, [layout_list_info[0][0], layout_list_info[-1][1]] # 同时还返回了这个页面级别的开头、结尾是不是列表的信息
|
||||
|
||||
@@ -422,18 +453,14 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
|
||||
logger.info(f"连接page {page_num} 内的list")
|
||||
# 向layout_paras[i] 寻找开头具有相同缩进的连续的行
|
||||
may_list_lines = []
|
||||
for j in range(len(next_page_paras[0])):
|
||||
next_page_block_j = next_page_paras[0][j]
|
||||
if next_page_block_j["type"] != BlockType.Text:
|
||||
break
|
||||
lines = next_page_block_j["lines"]
|
||||
if len(lines) == 1: # 只可能是一行,多行情况再需要分析了
|
||||
if lines[0]['bbox'][0] > __find_layout_bbox_by_line(lines[0]['bbox'], next_page_layout_bbox)[0]:
|
||||
may_list_lines.append(lines[0])
|
||||
next_page_first_para = next_page_paras[0][0]
|
||||
if next_page_first_para["type"] == BlockType.Text:
|
||||
lines = next_page_first_para["lines"]
|
||||
for line in lines:
|
||||
if line['bbox'][0] > __find_layout_bbox_by_line(line['bbox'], next_page_layout_bbox)[0]:
|
||||
may_list_lines.append(line)
|
||||
else:
|
||||
break
|
||||
else:
|
||||
break
|
||||
# 如果这些行的缩进是相等的,那么连到上一个layout的最后一个段落上。
|
||||
if len(may_list_lines) > 0 and len(set([x['bbox'][0] for x in may_list_lines])) == 1:
|
||||
#pre_page_paras[-1].append(may_list_lines)
|
||||
@@ -442,7 +469,7 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
|
||||
for span in line["spans"]:
|
||||
span[CROSS_PAGE] = True
|
||||
pre_page_paras[-1][-1]["lines"].extend(may_list_lines)
|
||||
next_page_paras[0] = next_page_paras[0][len(may_list_lines):]
|
||||
next_page_first_para["lines"] = next_page_first_para["lines"][len(may_list_lines):]
|
||||
return True
|
||||
|
||||
return False
|
||||
@@ -471,7 +498,6 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
|
||||
if len(blocks_group) == 0:
|
||||
return connected_layout_blocks
|
||||
|
||||
#connected_layout_paras.append(layout_paras[0])
|
||||
connected_layout_blocks.append(blocks_group[0])
|
||||
for i in range(1, len(blocks_group)):
|
||||
try:
|
||||
@@ -484,6 +510,9 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
|
||||
if blocks_group[i - 1][-1]["type"] != BlockType.Text or blocks_group[i][0]["type"] != BlockType.Text:
|
||||
connected_layout_blocks.append(blocks_group[i])
|
||||
continue
|
||||
if len(blocks_group[i - 1][-1]["lines"]) == 0 or len(blocks_group[i][0]["lines"]) == 0:
|
||||
connected_layout_blocks.append(blocks_group[i])
|
||||
continue
|
||||
pre_last_line = blocks_group[i - 1][-1]["lines"][-1]
|
||||
next_first_line = blocks_group[i][0]["lines"][0]
|
||||
except Exception as e:
|
||||
@@ -505,7 +534,7 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
|
||||
|
||||
pre_last_line_text = pre_last_line_text.strip()
|
||||
next_first_line_text = next_first_line_text.strip()
|
||||
if pre_last_line['bbox'][2] == pre_x2_max and pre_last_line_text[-1] not in LINE_STOP_FLAG and \
|
||||
if pre_last_line['bbox'][2] == pre_x2_max and pre_last_line_text and pre_last_line_text[-1] not in LINE_STOP_FLAG and \
|
||||
next_first_line['bbox'][0] == next_x0_min: # 前面一行沾满了整个行,并且没有结尾符号.下一行没有空白开头。
|
||||
"""连接段落条件成立,将前一个layout的段落和后一个layout的段落连接。"""
|
||||
#connected_layout_paras[-1][-1].extend(layout_paras[i][0])
|
||||
|
||||
@@ -16,11 +16,11 @@ import re
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from magic_pdf.libs.version import __version__
|
||||
from magic_pdf.rw import AbsReaderWriter
|
||||
from magic_pdf.pdf_parse_by_ocr_v2 import parse_pdf_by_ocr
|
||||
from magic_pdf.pdf_parse_by_txt_v2 import parse_pdf_by_txt
|
||||
|
||||
|
||||
PARSE_TYPE_TXT = "txt"
|
||||
PARSE_TYPE_OCR = "ocr"
|
||||
|
||||
@@ -39,6 +39,8 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
|
||||
|
||||
pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT
|
||||
|
||||
pdf_info_dict["_version_name"] = __version__
|
||||
|
||||
return pdf_info_dict
|
||||
|
||||
|
||||
@@ -57,6 +59,8 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
|
||||
|
||||
pdf_info_dict["_parse_type"] = PARSE_TYPE_OCR
|
||||
|
||||
pdf_info_dict["_version_name"] = __version__
|
||||
|
||||
return pdf_info_dict
|
||||
|
||||
|
||||
@@ -88,7 +92,7 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
|
||||
for span in line['spans']:
|
||||
text_all += span['content']
|
||||
|
||||
def calculate_garbled_rate(text):
|
||||
def calculate_not_common_character_rate(text):
|
||||
garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]')
|
||||
# 计算乱码字符的数量
|
||||
garbage_count = len(garbage_regex.findall(text))
|
||||
@@ -97,9 +101,18 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
|
||||
return 0 # 避免除以零的错误
|
||||
return garbage_count / total
|
||||
|
||||
garbled_rate = calculate_garbled_rate(text_all)
|
||||
def calculate_not_printable_rate(text):
    """Return the fraction of characters in ``text`` that are not printable.

    A character counts as printable when ``str.isprintable()`` is true for it.
    An empty ``text`` yields 0.
    """
    total = len(text)
    if total == 0:
        return 0  # avoid a division-by-zero error
    not_printable = sum(1 for ch in text if not ch.isprintable())
    return not_printable / total
|
||||
|
||||
if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False) or garbled_rate > 0.8:
|
||||
# not_common_character_rate = calculate_not_common_character_rate(text_all)
|
||||
not_printable_rate = calculate_not_printable_rate(text_all)
|
||||
# 测试乱码pdf,not_common_character_rate > 0.95, not_printable_rate > 0.15
|
||||
# not_common_character_rate对小语种可能会有误伤,not_printable_rate对小语种较为友好
|
||||
if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False) or not_printable_rate > 0.1:
|
||||
logger.warning(f"parse_pdf_by_txt drop or error or garbled_rate too large, switch to parse_pdf_by_ocr")
|
||||
pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
|
||||
if pdf_info_dict is None:
|
||||
@@ -109,4 +122,6 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
|
||||
else:
|
||||
pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT
|
||||
|
||||
pdf_info_dict["_version_name"] = __version__
|
||||
|
||||
return pdf_info_dict
|
||||
|
||||
@@ -2,7 +2,7 @@ boto3>=1.28.43
|
||||
Brotli>=1.1.0
|
||||
click>=8.1.7
|
||||
Distance>=0.1.3
|
||||
PyMuPDF>=1.24.3
|
||||
PyMuPDF>=1.24.4
|
||||
loguru>=0.6.0
|
||||
matplotlib>=3.8.3
|
||||
numpy>=1.21.6
|
||||
|
||||
48
setup.py
48
setup.py
@@ -1,5 +1,10 @@
|
||||
from setuptools import setup, find_packages
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
from setuptools import setup, find_packages
|
||||
from magic_pdf.libs.version import __version__
|
||||
|
||||
|
||||
def parse_requirements(filename):
|
||||
with open(filename) as f:
|
||||
lines = f.read().splitlines()
|
||||
@@ -8,37 +13,22 @@ def parse_requirements(filename):
|
||||
|
||||
for line in lines:
|
||||
if "http" in line:
|
||||
pkg_name_with_version = line.split('/')[-1].split('-')[0]
|
||||
requires.append(pkg_name_with_version)
|
||||
pkg_name_without_url = line.split('@')[0].strip()
|
||||
requires.append(pkg_name_without_url)
|
||||
else:
|
||||
requires.append(line)
|
||||
|
||||
return requires
|
||||
|
||||
def get_version():
    """Derive the package version from the most recent git tag.

    Runs ``git describe --tags`` and expects a tag shaped like
    ``magic_pdf-<version>-released``; the ``<version>`` part is returned.
    Any failure (git error, unexpected tag format, ...) is printed and the
    fallback ``"0.0.0"`` is returned instead.
    """
    fallback = "0.0.0"
    try:
        described = subprocess.check_output(["git", "describe", "--tags"])
        version = described.decode().strip()
        pieces = version.split("-")
        if len(pieces) <= 1 or not pieces[0].startswith("magic_pdf"):
            raise ValueError(f"Invalid version tag {version}. Expected format is magic_pdf-<version>-released.")
        return pieces[1]
    except Exception as e:
        print(e)
        return fallback
|
||||
|
||||
|
||||
requires = parse_requirements('requirements.txt')
|
||||
|
||||
setup(
|
||||
name="magic_pdf", # 项目名
|
||||
# version="0.1.3", # 版本号
|
||||
version=get_version(), # 自动从tag中获取版本号
|
||||
packages=find_packages(), # 包含所有的包
|
||||
install_requires=requires, # 项目依赖的第三方库
|
||||
python_requires=">=3.9", # 项目依赖的 Python 版本
|
||||
# entry_points={"console_scripts": ["my_command=my_project.main:run"]}, # 项目提供的可执行命令
|
||||
include_package_data=True, # 是否包含非代码文件,如数据文件、配置文件等
|
||||
zip_safe=False, # 是否使用 zip 文件格式打包,一般设为 False
|
||||
)
|
||||
if __name__ == '__main__':
|
||||
setup(
|
||||
name="magic_pdf", # 项目名
|
||||
version=__version__, # 自动从tag中获取版本号
|
||||
packages=find_packages(), # 包含所有的包
|
||||
install_requires=parse_requirements('requirements.txt'), # 项目依赖的第三方库
|
||||
python_requires=">=3.9", # 项目依赖的 Python 版本
|
||||
# entry_points={"console_scripts": ["my_command=my_project.main:run"]}, # 项目提供的可执行命令
|
||||
include_package_data=True, # 是否包含非代码文件,如数据文件、配置文件等
|
||||
zip_safe=False, # 是否使用 zip 文件格式打包,一般设为 False
|
||||
)
|
||||
|
||||
22
update_version.py
Normal file
22
update_version.py
Normal file
@@ -0,0 +1,22 @@
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
|
||||
def get_version():
    """Derive the package version from the most recent git tag.

    Runs ``git describe --tags`` and expects a tag shaped like
    ``magic_pdf-<version>-released``; the ``<version>`` part is returned.
    Any failure (git error, unexpected tag format, ...) is printed and the
    fallback ``"0.0.0"`` is returned instead.
    """
    fallback = "0.0.0"
    try:
        described = subprocess.check_output(["git", "describe", "--tags"])
        version = described.decode().strip()
        pieces = version.split("-")
        if len(pieces) <= 1 or not pieces[0].startswith("magic_pdf"):
            raise ValueError(f"Invalid version tag {version}. Expected format is magic_pdf-<version>-released.")
        return pieces[1]
    except Exception as e:
        print(e)
        return fallback
|
||||
|
||||
|
||||
def write_version_to_commons(version):
    """Overwrite ``magic_pdf/libs/version.py`` with the given version.

    The target path is resolved relative to the directory containing this
    script, and the file receives a single ``__version__ = "<version>"`` line.

    Args:
        version: version string to record.
    """
    commons_path = os.path.join(os.path.dirname(__file__), 'magic_pdf', 'libs', 'version.py')
    with open(commons_path, 'w', encoding='utf-8') as f:
        f.write(f'__version__ = "{version}"\n')


if __name__ == '__main__':
    # Running `python update_version.py` must actually refresh version.py;
    # without this entry point the script only defines functions and exits.
    write_version_to_commons(get_version())
|
||||
Reference in New Issue
Block a user