mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
Compare commits
78 Commits
magic_pdf-
...
magic_pdf-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5f313bd0b4 | ||
|
|
3b7342b894 | ||
|
|
9dc5033cf7 | ||
|
|
389826c5fe | ||
|
|
c96aa88d13 | ||
|
|
738f9274a9 | ||
|
|
084dc22ab1 | ||
|
|
6c52856d2a | ||
|
|
c69f414b20 | ||
|
|
0306d66d25 | ||
|
|
35d39735da | ||
|
|
e57a9d87c7 | ||
|
|
ce0d99057a | ||
|
|
0606301412 | ||
|
|
39b46ea980 | ||
|
|
aeef64b482 | ||
|
|
d2e8271322 | ||
|
|
d62dd24939 | ||
|
|
0c33f2f0d3 | ||
|
|
64c628434d | ||
|
|
a5ff8acea7 | ||
|
|
0b97f26552 | ||
|
|
2284e0d77b | ||
|
|
f80560ffea | ||
|
|
5aa2e01264 | ||
|
|
384c979d68 | ||
|
|
2ad0134ca2 | ||
|
|
0678a8603d | ||
|
|
9c6cb7b772 | ||
|
|
bf18172d8a | ||
|
|
e92de75844 | ||
|
|
b7a418b538 | ||
|
|
3c145ba0ca | ||
|
|
999b698fca | ||
|
|
9b5b116369 | ||
|
|
c50fa4dc72 | ||
|
|
54f31b65cb | ||
|
|
4ce15c44f3 | ||
|
|
88f2245d86 | ||
|
|
bc05526602 | ||
|
|
b18e9365fa | ||
|
|
48b6992b71 | ||
|
|
4f6171d19e | ||
|
|
595517054b | ||
|
|
705c4dcf30 | ||
|
|
ff52be3304 | ||
|
|
a68f4174cd | ||
|
|
2d0d5a8208 | ||
|
|
887a3d989b | ||
|
|
6ab1a65a6a | ||
|
|
48d3032318 | ||
|
|
ddde1b82f2 | ||
|
|
c7a685b302 | ||
|
|
93a59ff4a3 | ||
|
|
ab8413811f | ||
|
|
e73964fc12 | ||
|
|
b74f17e439 | ||
|
|
20278040a5 | ||
|
|
9d0b4e95de | ||
|
|
7fd8d97edb | ||
|
|
1877055672 | ||
|
|
75d0fa3d24 | ||
|
|
07f6c49707 | ||
|
|
1de37e4c65 | ||
|
|
bd1834284e | ||
|
|
496045f361 | ||
|
|
75478eda89 | ||
|
|
3f3edc39f5 | ||
|
|
97a4e47319 | ||
|
|
5de372245c | ||
|
|
135adac43d | ||
|
|
ba52e33527 | ||
|
|
78ed786794 | ||
|
|
4ff09a2fbc | ||
|
|
f8548a8ea2 | ||
|
|
10a95bcd05 | ||
|
|
dbdbaf58be | ||
|
|
afe92f07d6 |
61
.github/workflows/python-package.yml
vendored
61
.github/workflows/python-package.yml
vendored
@@ -11,8 +11,51 @@ on:
|
||||
|
||||
|
||||
jobs:
|
||||
build:
|
||||
|
||||
update-version:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: master
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
|
||||
- name: Update version.py
|
||||
run: |
|
||||
python update_version.py
|
||||
|
||||
- name: Verify version.py
|
||||
run: |
|
||||
ls -l magic_pdf/libs/version.py
|
||||
cat magic_pdf/libs/version.py
|
||||
|
||||
- name: Commit changes
|
||||
run: |
|
||||
git config --local user.email "moe@myhloli.com"
|
||||
git config --local user.name "myhloli"
|
||||
git add magic_pdf/libs/version.py
|
||||
if git diff-index --quiet HEAD; then
|
||||
echo "No changes to commit"
|
||||
else
|
||||
git commit -m "Update version.py with new version"
|
||||
fi
|
||||
id: commit_changes
|
||||
|
||||
- name: Push changes
|
||||
if: steps.commit_changes.outcome == 'success'
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }}
|
||||
run: |
|
||||
git push origin HEAD:master
|
||||
|
||||
build:
|
||||
needs: [ update-version ]
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -23,8 +66,14 @@ jobs:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: master
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Verify version.py
|
||||
run: |
|
||||
ls -l magic_pdf/libs/version.py
|
||||
cat magic_pdf/libs/version.py
|
||||
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
@@ -70,8 +119,8 @@ jobs:
|
||||
files: './dist/*.whl'
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }}
|
||||
# - name: Publish to PyPI
|
||||
# uses: pypa/gh-action-pypi-publish@release/v1
|
||||
# with:
|
||||
# user: __token__
|
||||
# password: ${{ secrets.PYPI_TOKEN }}
|
||||
|
||||
- name: Publish distribution to PyPI
|
||||
run: |
|
||||
pip install twine
|
||||
twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
|
||||
|
||||
@@ -27,6 +27,7 @@ import sys
|
||||
import click
|
||||
from loguru import logger
|
||||
from pathlib import Path
|
||||
from magic_pdf.libs.version import __version__
|
||||
|
||||
from magic_pdf.libs.MakeContentConfig import DropMode
|
||||
from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
|
||||
@@ -43,6 +44,7 @@ from magic_pdf.libs.config_reader import get_local_dir
|
||||
from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
|
||||
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
||||
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
||||
import csv
|
||||
|
||||
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
|
||||
|
||||
@@ -52,13 +54,22 @@ def prepare_env(pdf_file_name, method):
|
||||
get_local_dir(), "magic-pdf", pdf_file_name, method
|
||||
)
|
||||
|
||||
local_image_dir = os.path.join(local_parent_dir, "images")
|
||||
local_image_dir = os.path.join(str(local_parent_dir), "images")
|
||||
local_md_dir = local_parent_dir
|
||||
os.makedirs(local_image_dir, exist_ok=True)
|
||||
os.makedirs(local_md_dir, exist_ok=True)
|
||||
return local_image_dir, local_md_dir
|
||||
|
||||
|
||||
def write_to_csv(csv_file_path, csv_data):
    """Append one row to a CSV file and print a confirmation message.

    Args:
        csv_file_path: Path of the CSV file. It is opened in append mode with
            UTF-8 encoding.
        csv_data: Sequence of cell values written as a single row.
    """
    # newline='' leaves line-ending handling to the csv module.
    with open(csv_file_path, mode='a', newline='', encoding='utf-8') as out_file:
        csv.writer(out_file).writerow(csv_data)
        print(f"数据已成功追加到 '{csv_file_path}'")
|
||||
|
||||
|
||||
def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer, md_writer, image_dir, local_md_dir):
|
||||
if parse_method == "auto":
|
||||
jso_useful_key = {
|
||||
@@ -75,28 +86,54 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
|
||||
sys.exit(1)
|
||||
|
||||
pipe.pipe_classify()
|
||||
|
||||
'''如果没有传入有效的模型数据,则使用内置paddle解析'''
|
||||
if len(model_list) == 0:
|
||||
pipe.pipe_analyze()
|
||||
|
||||
pipe.pipe_parse()
|
||||
pdf_info = pipe.pdf_mid_data['pdf_info']
|
||||
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
|
||||
draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
|
||||
|
||||
# write_to_csv(r"D:\project\20231108code-clean\linshixuqiu\pdf_dev\新模型\新建文件夹\luanma.csv",
|
||||
# [pdf_file_name, pipe.pdf_mid_data['not_common_character_rate'], pipe.pdf_mid_data['not_printable_rate']])
|
||||
|
||||
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE)
|
||||
'''写markdown'''
|
||||
md_writer.write(
|
||||
content=md_content, path=f"{pdf_file_name}.md", mode=AbsReaderWriter.MODE_TXT
|
||||
)
|
||||
'''写middle_json'''
|
||||
md_writer.write(
|
||||
content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
|
||||
path=f"{pdf_file_name}.json",
|
||||
path=f"{pdf_file_name}_middle.json",
|
||||
mode=AbsReaderWriter.MODE_TXT,
|
||||
)
|
||||
|
||||
content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
|
||||
'''写model_json'''
|
||||
md_writer.write(
|
||||
str(content_list), f"{pdf_file_name}.txt", AbsReaderWriter.MODE_TXT
|
||||
content=json_parse.dumps(pipe.model_list, ensure_ascii=False, indent=4),
|
||||
path=f"{pdf_file_name}_model.json",
|
||||
mode=AbsReaderWriter.MODE_TXT,
|
||||
)
|
||||
'''写源pdf'''
|
||||
md_writer.write(
|
||||
content=pdf_bytes,
|
||||
path=f"{pdf_file_name}_origin.pdf",
|
||||
mode=AbsReaderWriter.MODE_BIN,
|
||||
)
|
||||
content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
|
||||
'''写content_list'''
|
||||
md_writer.write(
|
||||
content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
|
||||
path=f"{pdf_file_name}_content_list.json",
|
||||
mode=AbsReaderWriter.MODE_TXT
|
||||
)
|
||||
|
||||
|
||||
@click.group()
|
||||
@click.version_option(__version__, "--version", "-v", help="显示版本信息")
|
||||
@click.help_option("--help", "-h", help="显示帮助信息")
|
||||
def cli():
|
||||
pass
|
||||
|
||||
@@ -141,7 +178,7 @@ def json_command(json, method):
|
||||
pdf_file_name = Path(s3_file_path).stem
|
||||
pdf_data = read_s3_path(s3_file_path)
|
||||
local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
|
||||
|
||||
|
||||
local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
|
||||
local_md_dir
|
||||
)
|
||||
@@ -158,60 +195,60 @@ def json_command(json, method):
|
||||
)
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.option("--local_json", type=str, help="输入一个本地jsonl路径")
|
||||
@click.option(
|
||||
"--method",
|
||||
type=parse_pdf_methods,
|
||||
help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
|
||||
default="auto",
|
||||
)
|
||||
def local_json_command(local_json, method):
|
||||
def read_s3_path(s3path):
|
||||
bucket, key = parse_s3path(s3path)
|
||||
@cli.command()
|
||||
@click.option("--local_json", type=str, help="输入一个本地jsonl路径")
|
||||
@click.option(
|
||||
"--method",
|
||||
type=parse_pdf_methods,
|
||||
help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
|
||||
default="auto",
|
||||
)
|
||||
def local_json_command(local_json, method):
|
||||
def read_s3_path(s3path):
|
||||
bucket, key = parse_s3path(s3path)
|
||||
|
||||
s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
|
||||
s3_rw = S3ReaderWriter(
|
||||
s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
|
||||
)
|
||||
may_range_params = parse_s3_range_params(s3path)
|
||||
if may_range_params is None or 2 != len(may_range_params):
|
||||
byte_start, byte_end = 0, None
|
||||
else:
|
||||
byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
|
||||
byte_end += byte_start - 1
|
||||
return s3_rw.read_jsonl(
|
||||
remove_non_official_s3_args(s3path),
|
||||
byte_start,
|
||||
byte_end,
|
||||
AbsReaderWriter.MODE_BIN,
|
||||
s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
|
||||
s3_rw = S3ReaderWriter(
|
||||
s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
|
||||
)
|
||||
may_range_params = parse_s3_range_params(s3path)
|
||||
if may_range_params is None or 2 != len(may_range_params):
|
||||
byte_start, byte_end = 0, None
|
||||
else:
|
||||
byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
|
||||
byte_end += byte_start - 1
|
||||
return s3_rw.read_jsonl(
|
||||
remove_non_official_s3_args(s3path),
|
||||
byte_start,
|
||||
byte_end,
|
||||
AbsReaderWriter.MODE_BIN,
|
||||
)
|
||||
|
||||
with open(local_json, "r", encoding="utf-8") as f:
|
||||
for json_line in f:
|
||||
jso = json_parse.loads(json_line)
|
||||
|
||||
s3_file_path = jso.get("file_location")
|
||||
if s3_file_path is None:
|
||||
s3_file_path = jso.get("path")
|
||||
pdf_file_name = Path(s3_file_path).stem
|
||||
pdf_data = read_s3_path(s3_file_path)
|
||||
local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
|
||||
|
||||
local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
|
||||
local_md_dir
|
||||
)
|
||||
|
||||
with open(local_json, "r", encoding="utf-8") as f:
|
||||
for json_line in f:
|
||||
jso = json_parse.loads(json_line)
|
||||
|
||||
s3_file_path = jso.get("file_location")
|
||||
if s3_file_path is None:
|
||||
s3_file_path = jso.get("path")
|
||||
pdf_file_name = Path(s3_file_path).stem
|
||||
pdf_data = read_s3_path(s3_file_path)
|
||||
local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
|
||||
|
||||
local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
|
||||
local_md_dir
|
||||
)
|
||||
|
||||
_do_parse(
|
||||
pdf_file_name,
|
||||
pdf_data,
|
||||
jso["doc_layout_result"],
|
||||
method,
|
||||
local_image_rw,
|
||||
local_md_rw,
|
||||
os.path.basename(local_image_dir),
|
||||
local_md_dir
|
||||
)
|
||||
_do_parse(
|
||||
pdf_file_name,
|
||||
pdf_data,
|
||||
jso["doc_layout_result"],
|
||||
method,
|
||||
local_image_rw,
|
||||
local_md_rw,
|
||||
os.path.basename(local_image_dir),
|
||||
local_md_dir
|
||||
)
|
||||
|
||||
|
||||
@cli.command()
|
||||
@@ -226,19 +263,28 @@ def json_command(json, method):
|
||||
default="auto",
|
||||
)
|
||||
def pdf_command(pdf, model, method):
|
||||
# 这里处理pdf和模型相关的逻辑
|
||||
if model is None:
|
||||
model = pdf.replace(".pdf", ".json")
|
||||
if not os.path.exists(model):
|
||||
print(f"make sure json {model} existed and place under {os.path.dirname(pdf)}", file=sys.stderr)
|
||||
exit(1)
|
||||
|
||||
def read_fn(path):
|
||||
disk_rw = DiskReaderWriter(os.path.dirname(path))
|
||||
return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
|
||||
|
||||
pdf_data = read_fn(pdf)
|
||||
jso = json_parse.loads(read_fn(model).decode("utf-8"))
|
||||
|
||||
def get_model_json(model_path):
|
||||
# 这里处理pdf和模型相关的逻辑
|
||||
if model_path is None:
|
||||
model_path = pdf.replace(".pdf", ".json")
|
||||
if not os.path.exists(model_path):
|
||||
logger.warning(f"not found json {model_path} existed, use paddle analyze")
|
||||
# 本地无模型数据则调用内置paddle分析,先传空list,在内部识别到空list再调用paddle
|
||||
model_json = "[]"
|
||||
else:
|
||||
model_json = read_fn(model_path).decode("utf-8")
|
||||
else:
|
||||
model_json = read_fn(model_path).decode("utf-8")
|
||||
|
||||
return model_json
|
||||
|
||||
jso = json_parse.loads(get_model_json(model))
|
||||
pdf_file_name = Path(pdf).stem
|
||||
local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
|
||||
local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
|
||||
|
||||
@@ -21,7 +21,7 @@ from magic_pdf.libs.commons import mymax, get_top_percent_list
|
||||
from magic_pdf.filter.pdf_meta_scan import scan_max_page, junk_limit_min
|
||||
|
||||
TEXT_LEN_THRESHOLD = 100
|
||||
AVG_TEXT_LEN_THRESHOLD = 200
|
||||
AVG_TEXT_LEN_THRESHOLD = 100
|
||||
TEXT_LEN_SAMPLE_RATIO = 0.1 # 抽取0.1的页面进行文字长度统计
|
||||
|
||||
|
||||
@@ -65,12 +65,14 @@ def merge_images(image_list, page_width, page_height, max_offset=5, max_gap=2):
|
||||
# 如果宽达标,检测是否能竖着拼
|
||||
if full_width:
|
||||
# 竖着拼需要满足两个前提,左右边界各偏移不能超过 max_offset,第一张图的下边界和第二张图的上边界偏移不能超过 max_gap
|
||||
close1 = (last_x0 - max_offset) <= x0 <= (last_x0 + max_offset) and (last_x1 - max_offset) <= x1 <= (last_x1 + max_offset) and (last_y1 - max_gap) <= y0 <= (last_y1 + max_gap)
|
||||
close1 = (last_x0 - max_offset) <= x0 <= (last_x0 + max_offset) and (last_x1 - max_offset) <= x1 <= (
|
||||
last_x1 + max_offset) and (last_y1 - max_gap) <= y0 <= (last_y1 + max_gap)
|
||||
|
||||
# 如果高达标,检测是否可以横着拼
|
||||
if full_height:
|
||||
# 横着拼需要满足两个前提,上下边界各偏移不能超过 max_offset,第一张图的右边界和第二张图的左边界偏移不能超过 max_gap
|
||||
close2 = (last_y0 - max_offset) <= y0 <= (last_y0 + max_offset) and (last_y1 - max_offset) <= y1 <= (last_y1 + max_offset) and (last_x1 - max_gap) <= x0 <= (last_x1 + max_gap)
|
||||
close2 = (last_y0 - max_offset) <= y0 <= (last_y0 + max_offset) and (last_y1 - max_offset) <= y1 <= (
|
||||
last_y1 + max_offset) and (last_x1 - max_gap) <= x0 <= (last_x1 + max_gap)
|
||||
|
||||
# Check if the image can be merged with the last image
|
||||
if (full_width and close1) or (full_height and close2):
|
||||
@@ -109,10 +111,9 @@ def classify_by_area(total_page: int, page_width, page_height, img_sz_list, text
|
||||
# 先对每个id出现的次数做个统计
|
||||
objid_cnt = Counter([objid for page_img_sz in img_sz_list for _, _, _, _, objid in page_img_sz])
|
||||
# 再去掉出现次数大于10的
|
||||
if total_page >= scan_max_page:# 新的meta_scan只扫描前 scan_max_page 页,页数大于 scan_max_page 当total_page为 scan_max_page
|
||||
if total_page >= scan_max_page: # 新的meta_scan只扫描前 scan_max_page 页,页数大于 scan_max_page 当total_page为 scan_max_page
|
||||
total_page = scan_max_page
|
||||
|
||||
|
||||
repeat_threshold = 2 # 把bad_image的阈值设为2
|
||||
# repeat_threshold = min(2, total_page) # 当total_page为1时,repeat_threshold为1,会产生误判导致所有img变成bad_img
|
||||
bad_image_objid = set([objid for objid, cnt in objid_cnt.items() if cnt >= repeat_threshold])
|
||||
@@ -129,26 +130,26 @@ def classify_by_area(total_page: int, page_width, page_height, img_sz_list, text
|
||||
# if len(fake_image_ids) > 0 and any([l > TEXT_LEN_THRESHOLD for l in text_len_at_bad_image_page_idx]): # 这些透明图片所在的页面上有文字大于阈值
|
||||
# return True
|
||||
|
||||
img_sz_list = [[img_sz for img_sz in page_img_sz if img_sz[-1] not in bad_image_objid] for page_img_sz in img_sz_list] # 过滤掉重复出现的图片
|
||||
|
||||
img_sz_list = [[img_sz for img_sz in page_img_sz if img_sz[-1] not in bad_image_objid] for page_img_sz in
|
||||
img_sz_list] # 过滤掉重复出现的图片
|
||||
|
||||
# 有的扫描版会把一页图片拆成很多张,需要先把图拼起来再计算
|
||||
img_sz_list = merge_images(img_sz_list, page_width, page_height)
|
||||
|
||||
# 计算每个页面上最大的图的面积,然后计算这个面积占页面面积的比例
|
||||
max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in img_sz_list]
|
||||
max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in
|
||||
img_sz_list]
|
||||
page_area = page_width * page_height
|
||||
max_image_area_per_page = [area / page_area for area in max_image_area_per_page]
|
||||
max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.5]
|
||||
|
||||
if len(max_image_area_per_page) >= 0.5 * total_page: # 阈值从0.8改到0.5,适配3页里面有两页和两页里面有一页的情况
|
||||
if len(max_image_area_per_page) >= 0.5 * total_page: # 阈值从0.8改到0.5,适配3页里面有两页和两页里面有一页的情况
|
||||
# 这里条件成立的前提是把反复出现的图片去掉了。这些图片是隐藏的透明图层,其特点是id都一样
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
|
||||
|
||||
def classify_by_text_len(text_len_list: list, total_page: int):
|
||||
"""
|
||||
随机抽取10%的页面,如果少于5个页面,那么就取全部页面。
|
||||
@@ -173,6 +174,7 @@ def classify_by_text_len(text_len_list: list, total_page: int):
|
||||
is_text_pdf = any([text_len > TEXT_LEN_THRESHOLD for text_len in text_len_lst])
|
||||
return is_text_pdf
|
||||
|
||||
|
||||
def classify_by_avg_words(text_len_list: list):
|
||||
"""
|
||||
补充规则,如果平均每页字数少于 AVG_TEXT_LEN_THRESHOLD,就不是文字pdf
|
||||
@@ -193,6 +195,7 @@ def classify_by_avg_words(text_len_list: list):
|
||||
|
||||
return is_text_pdf
|
||||
|
||||
|
||||
def classify_by_img_num(img_sz_list: list, img_num_list: list):
|
||||
"""
|
||||
补充规则,有一种扫描版本的PDF,每一页都会放所有的扫描页进去,在 metascan 时会被去重,
|
||||
@@ -208,11 +211,11 @@ def classify_by_img_num(img_sz_list: list, img_num_list: list):
|
||||
# img_sz_list中非空元素的个数小于1,前80%的元素都相等,且最大值大于等于junk_limit_min
|
||||
if count_img_sz_list_not_none <= 1 and len(set(top_eighty_percent)) == 1 and max(img_num_list) >= junk_limit_min:
|
||||
|
||||
#拿max和min的值,用来判断list内的值是否全都相等
|
||||
# min_imgs = min(img_num_list)
|
||||
# max_imgs = max(img_num_list)
|
||||
#
|
||||
# if count_img_sz_list_not_none == 0 and max_imgs == min_imgs and max_imgs >= junk_limit_min:
|
||||
#拿max和min的值,用来判断list内的值是否全都相等
|
||||
# min_imgs = min(img_num_list)
|
||||
# max_imgs = max(img_num_list)
|
||||
#
|
||||
# if count_img_sz_list_not_none == 0 and max_imgs == min_imgs and max_imgs >= junk_limit_min:
|
||||
return False # 如果满足这个条件,一定不是文字版pdf
|
||||
else:
|
||||
return True # 不满足这三个条件,可能是文字版pdf,通过其他规则判断
|
||||
@@ -244,6 +247,7 @@ def classify_by_text_layout(text_layout_per_page: list):
|
||||
else:
|
||||
return False # 文本布局未知,默认认为不是文字版pdf
|
||||
|
||||
|
||||
def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
|
||||
"""
|
||||
判断一页是否由细长条组成,有两个条件:
|
||||
@@ -258,6 +262,7 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
|
||||
Returns:
|
||||
bool: 如果满足条件的页面的比例小于0.5,返回True,否则返回False
|
||||
"""
|
||||
|
||||
def is_narrow_strip(img):
|
||||
x0, y0, x1, y1, _ = img
|
||||
width, height = x1 - x0, y1 - y0
|
||||
@@ -299,7 +304,8 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
|
||||
return narrow_strip_pages_ratio < 0.5
|
||||
|
||||
|
||||
def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list, text_layout_list: list):
|
||||
def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list,
|
||||
text_layout_list: list):
|
||||
"""
|
||||
这里的图片和页面长度单位是pts
|
||||
:param total_page:
|
||||
@@ -324,7 +330,9 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
|
||||
elif not any(results.values()):
|
||||
return False, results
|
||||
else:
|
||||
logger.warning(f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']}, by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}, by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']}", file=sys.stderr) # 利用这种情况可以快速找出来哪些pdf比较特殊,针对性修正分类算法
|
||||
logger.warning(
|
||||
f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']}, by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}, by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']}",
|
||||
file=sys.stderr) # 利用这种情况可以快速找出来哪些pdf比较特殊,针对性修正分类算法
|
||||
return False, results
|
||||
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import datetime
|
||||
import json
|
||||
import os, re, configparser
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
import boto3
|
||||
@@ -11,6 +12,7 @@ from botocore.config import Config
|
||||
import fitz # 1.23.9中已经切换到rebase
|
||||
# import fitz_old as fitz # 使用1.23.9之前的pymupdf库
|
||||
|
||||
|
||||
def get_delta_time(input_time):
|
||||
return round(time.time() - input_time, 2)
|
||||
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
import pycld2 as cld2
|
||||
import regex
|
||||
import unicodedata
|
||||
|
||||
from fast_langdetect import detect_langs
|
||||
|
||||
RE_BAD_CHARS = regex.compile(r"\p{Cc}|\p{Cs}")
|
||||
|
||||
@@ -13,17 +12,13 @@ def remove_bad_chars(text):
|
||||
def detect_lang(text: str) -> str:
|
||||
if len(text) == 0:
|
||||
return ""
|
||||
|
||||
try:
|
||||
_, _, details = cld2.detect(text)
|
||||
lang_upper = detect_langs(text)
|
||||
except:
|
||||
# cld2 doesn't like control characters
|
||||
# https://github.com/mikemccand/chromium-compact-language-detector/issues/22#issuecomment-435904616
|
||||
html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C',]])
|
||||
_, _, details = cld2.detect(html_no_ctrl_chars)
|
||||
lang = ""
|
||||
html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
|
||||
lang_upper = detect_langs(html_no_ctrl_chars)
|
||||
try:
|
||||
lang = details[0][1].lower()
|
||||
lang = lang_upper.lower()
|
||||
except:
|
||||
lang = ""
|
||||
return lang
|
||||
@@ -33,4 +28,4 @@ if __name__ == '__main__':
|
||||
print(detect_lang("This is a test."))
|
||||
print(detect_lang("<html>This is a test</html>"))
|
||||
print(detect_lang("这个是中文测试。"))
|
||||
print(detect_lang("<html>这个是中文测试。</html>"))
|
||||
print(detect_lang("<html>这个是中文测试。</html>"))
|
||||
|
||||
1
magic_pdf/libs/version.py
Normal file
1
magic_pdf/libs/version.py
Normal file
@@ -0,0 +1 @@
|
||||
__version__ = "0.5.4"
|
||||
8
magic_pdf/model/360_layout_analysis.py
Normal file
8
magic_pdf/model/360_layout_analysis.py
Normal file
@@ -0,0 +1,8 @@
|
||||
from ultralytics import YOLO
|
||||
|
||||
image_path = '' # 待预测图片路径
|
||||
model_path = '' # 权重路径
|
||||
model = YOLO(model_path)
|
||||
|
||||
result = model(image_path, save=True, conf=0.5, save_crop=False, line_width=2)
|
||||
print(result)
|
||||
61
magic_pdf/model/doc_analyze_by_custom_model.py
Normal file
61
magic_pdf/model/doc_analyze_by_custom_model.py
Normal file
@@ -0,0 +1,61 @@
|
||||
import fitz
|
||||
import cv2
|
||||
from PIL import Image
|
||||
import numpy as np
|
||||
|
||||
from magic_pdf.model.model_list import MODEL
|
||||
from magic_pdf.model.pp_structure_v2 import CustomPaddleModel
|
||||
|
||||
|
||||
def dict_compare(d1, d2):
    """Return True when both dicts contain exactly the same key/value pairs."""
    left_items, right_items = d1.items(), d2.items()
    return left_items == right_items
|
||||
|
||||
|
||||
def remove_duplicates_dicts(lst):
    """Return the dicts of ``lst`` without repeats, keeping first-seen order.

    Two dicts count as duplicates when their key/value pairs are equal. The
    dicts are compared pairwise rather than hashed.

    Args:
        lst: Iterable of dicts.

    Returns:
        A new list holding the first occurrence of every distinct dict.
    """
    deduped = []
    for candidate in lst:
        for kept in deduped:
            # Same comparison as dict_compare(candidate, kept).
            if candidate.items() == kept.items():
                break
        else:
            # No equal dict has been kept yet.
            deduped.append(candidate)
    return deduped
|
||||
|
||||
|
||||
def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
    """Render every page of a PDF to an image array.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        dpi: Target resolution; each page is scaled by ``dpi / 72`` on both axes.

    Returns:
        One dict per page, in page order, with the keys ``"img"`` (the page
        converted with ``cv2.COLOR_RGB2BGR``), ``"width"`` and ``"height"``
        (size of the rendered pixmap).
    """
    rendered_pages = []
    with fitz.open("pdf", pdf_bytes) as pdf_doc:
        for page_no in range(pdf_doc.page_count):
            zoom = fitz.Matrix(dpi / 72, dpi / 72)
            pixmap = pdf_doc[page_no].get_pixmap(matrix=zoom, alpha=False)

            # NOTE: an earlier idea, currently disabled, was to re-render at
            # scale 1 when the pixmap exceeds 2000 pixels in width or height.

            rgb_image = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
            bgr_array = cv2.cvtColor(np.array(rgb_image), cv2.COLOR_RGB2BGR)
            rendered_pages.append(
                {"img": bgr_array, "width": pixmap.width, "height": pixmap.height}
            )
    return rendered_pages
|
||||
|
||||
|
||||
def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False, model=MODEL.Paddle):
    """Run layout analysis on every page of a PDF.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        ocr: Forwarded to ``CustomPaddleModel``.
        show_log: Forwarded to ``CustomPaddleModel``.
        model: Backend identifier; only ``MODEL.Paddle`` is implemented.

    Returns:
        A list with one dict per page: ``{"layout_dets": <model output>,
        "page_info": {"page_no", "height", "width"}}``. Height and width are
        those of the rendered page image.

    Raises:
        ValueError: If ``model`` is not a supported backend.
    """
    # Fail fast. Previously an unsupported model left the callable as None,
    # which only surfaced after rendering as
    # "'NoneType' object is not callable".
    if model != MODEL.Paddle:
        raise ValueError(f"unsupported layout model: {model!r}")

    images = load_images_from_pdf(pdf_bytes)
    custom_model = CustomPaddleModel(ocr=ocr, show_log=show_log)

    model_json = []
    for index, img_dict in enumerate(images):
        result = custom_model(img_dict["img"])
        page_info = {
            "page_no": index,
            "height": img_dict["height"],
            "width": img_dict["width"],
        }
        model_json.append({"layout_dets": result, "page_info": page_info})

    return model_json
|
||||
@@ -37,7 +37,14 @@ class MagicModel:
|
||||
)
|
||||
layout_dets = model_page_info["layout_dets"]
|
||||
for layout_det in layout_dets:
|
||||
x0, y0, _, _, x1, y1, _, _ = layout_det["poly"]
|
||||
|
||||
if layout_det.get("bbox") is not None:
|
||||
# 兼容直接输出bbox的模型数据,如paddle
|
||||
x0, y0, x1, y1 = layout_det["bbox"]
|
||||
else:
|
||||
# 兼容直接输出poly的模型数据,如xxx
|
||||
x0, y0, _, _, x1, y1, _, _ = layout_det["poly"]
|
||||
|
||||
bbox = [
|
||||
int(x0 / horizontal_scale_ratio),
|
||||
int(y0 / vertical_scale_ratio),
|
||||
|
||||
2
magic_pdf/model/model_list.py
Normal file
2
magic_pdf/model/model_list.py
Normal file
@@ -0,0 +1,2 @@
|
||||
class MODEL:
    """Names of the layout-analysis backends that callers can select."""

    # Backend implemented in pp_structure_v2.py (wraps paddleocr's PPStructure).
    Paddle = "pp_structure_v2"
|
||||
75
magic_pdf/model/pp_structure_v2.py
Normal file
75
magic_pdf/model/pp_structure_v2.py
Normal file
@@ -0,0 +1,75 @@
|
||||
import random
|
||||
|
||||
from loguru import logger
|
||||
from paddleocr import PPStructure
|
||||
|
||||
|
||||
def region_to_bbox(region):
    """Convert a point-list region into ``[x0, y0, x1, y1]``.

    Only the first and third points of ``region`` are read; presumably these
    are opposite corners of a quadrilateral (verify against the producer).

    Args:
        region: Indexable sequence of points, each indexable as ``(x, y)``.

    Returns:
        ``[region[0][0], region[0][1], region[2][0], region[2][1]]``.
    """
    first_pt, third_pt = region[0], region[2]
    return [first_pt[0], first_pt[1], third_pt[0], third_pt[1]]
|
||||
|
||||
|
||||
class CustomPaddleModel:
    """Callable wrapper around ``PPStructure`` that tags detections with category ids."""

    # Paddle layout label -> category id used by the rest of the pipeline.
    #   title 0 | text, reference 1 | header, footer 2 (abandon) | figure 3
    #   figure_caption 4 | table 5 | table_caption 6 | equation 8 (block equation)
    # The original note also lists 14 for inline-equation text; it is never
    # assigned here.
    _CATEGORY_IDS = {
        "title": 0,
        "text": 1,
        "reference": 1,
        "header": 2,
        "footer": 2,
        "figure": 3,
        "figure_caption": 4,
        "table": 5,
        "table_caption": 6,
        "equation": 8,
    }

    def __init__(self, ocr: bool = False, show_log: bool = False):
        """Build the underlying PPStructure pipeline with table recognition off."""
        self.model = PPStructure(table=False, ocr=ocr, show_log=show_log)

    def __call__(self, img):
        """Run the model on ``img`` and return its detections, adapted in place.

        Each detection loses its ``"img"`` and ``"res"`` entries, gains a
        ``"category_id"`` for known labels, and gets a ``"score"`` if the model
        did not provide one. Every entry found under ``"res"`` is appended to
        the returned list as a span dict with ``category_id`` 15.
        """
        result = self.model(img)
        spans = []
        for line in result:
            line.pop("img")

            category_id = self._CATEGORY_IDS.get(line["type"])
            if category_id is None:
                # Unknown labels are only reported; no category_id is set.
                logger.warning(f"unknown type: {line['type']}")
            else:
                line["category_id"] = category_id

            # Some paddleocr versions emit no score; fill in a value in [0.5, 1.0).
            if line.get("score") is None:
                line["score"] = 0.5 + random.random() * 0.5

            ocr_items = line.pop("res", None)
            if ocr_items is not None and len(ocr_items) > 0:
                spans.extend(
                    {
                        "category_id": 15,
                        "bbox": region_to_bbox(item["text_region"]),
                        "score": item["confidence"],
                        "text": item["text"],
                    }
                    for item in ocr_items
                )

        if spans:
            result.extend(spans)

        return result
|
||||
@@ -1,7 +1,7 @@
|
||||
from sklearn.cluster import DBSCAN
|
||||
import numpy as np
|
||||
from loguru import logger
|
||||
|
||||
import re
|
||||
from magic_pdf.libs.boxbase import _is_in_or_part_overlap_with_area_ratio as is_in_layout
|
||||
from magic_pdf.libs.ocr_content_type import ContentType, BlockType
|
||||
from magic_pdf.model.magic_model import MagicModel
|
||||
@@ -106,16 +106,19 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
|
||||
3. 如果非顶格,首字符大写,编码为2
|
||||
4. 如果非顶格,首字符非大写编码为3
|
||||
"""
|
||||
if len(lines) > 0:
|
||||
x_map_tag_dict, min_x_tag = cluster_line_x(lines)
|
||||
for l in lines:
|
||||
first_char = __get_span_text(l['spans'][0])[0]
|
||||
span_text = __get_span_text(l['spans'][0])
|
||||
first_char = span_text[0]
|
||||
layout = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)
|
||||
if not layout:
|
||||
line_fea_encode.append(0)
|
||||
else:
|
||||
layout_left = layout[0]
|
||||
if l['bbox'][0] == layout_left:
|
||||
#
|
||||
if x_map_tag_dict[round(l['bbox'][0])] == min_x_tag:
|
||||
# if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
|
||||
if not first_char.isalnum():
|
||||
if not first_char.isalnum() or if_match_reference_list(span_text):
|
||||
line_fea_encode.append(1)
|
||||
else:
|
||||
line_fea_encode.append(4)
|
||||
@@ -144,6 +147,36 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
|
||||
|
||||
return split_indices(total_lines, list_indice), list_start_idx
|
||||
|
||||
def cluster_line_x(lines: list) -> tuple:
    """Cluster the x0 of every line in a block and snap each to its cluster minimum.

    Args:
        lines: Non-empty list of line dicts; ``line['bbox'][0]`` is read as the
            line's x0 and rounded before clustering.

    Returns:
        A 2-tuple ``(x0_2_new_val, min_x0)``:
        ``x0_2_new_val`` maps each rounded x0 to the smallest rounded x0 of the
        cluster it belongs to; ``min_x0`` is the smallest such cluster value
        (initialised from the first line's rounded x0).
    """
    min_distance = 5  # DBSCAN eps: maximum gap between neighbouring x0 values
    min_sample = 1
    # DBSCAN expects 2-D samples, so each x0 is padded with a constant 0.
    x0_lst = np.array([[round(line['bbox'][0]), 0] for line in lines])
    x0_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x0_lst)
    labels = x0_clusters.labels_

    x0_2_new_val = {}  # rounded original x0 -> representative x0 of its cluster
    min_x0 = round(lines[0]["bbox"][0])
    for label in np.unique(labels):
        if label == -1:
            # -1 is DBSCAN's noise label; such points get no mapping.
            continue
        cluster_x0 = x0_lst[labels == label][:, 0]
        x0_new_val = np.min(cluster_x0)
        x0_2_new_val.update({round(raw_val): round(x0_new_val) for raw_val in cluster_x0})
        if x0_new_val < min_x0:
            min_x0 = x0_new_val
    return x0_2_new_val, min_x0
|
||||
|
||||
def if_match_reference_list(text: str) -> bool:
    """Return True when ``text`` begins with one or more digits followed by a dot.

    Example: ``"12. Some entry"`` matches, ``"12 Some entry"`` does not.
    """
    return re.match(r'^\d+\..*', text) is not None
|
||||
|
||||
|
||||
def __valign_lines(blocks, layout_bboxes):
|
||||
"""
|
||||
@@ -298,7 +331,7 @@ def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en"):
|
||||
block["type"] == BlockType.Text for line in
|
||||
block['lines']]
|
||||
total_lines = len(lines)
|
||||
if total_lines == 1:
|
||||
if total_lines == 1 or total_lines == 0:
|
||||
list_info.append([False, False])
|
||||
continue
|
||||
"""在进入到真正的分段之前,要对文字块从统计维度进行对齐方式的探测,
|
||||
@@ -315,10 +348,11 @@ def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en"):
|
||||
"""
|
||||
for list_start in list_start_line:
|
||||
if len(list_start) > 1:
|
||||
for i in range(1, len(list_start)):
|
||||
for i in range(0, len(list_start)):
|
||||
index = list_start[i] - 1
|
||||
if "content" in lines[index]["spans"][-1]:
|
||||
lines[index]["spans"][-1]["content"] += '\n\n'
|
||||
if index >= 0:
|
||||
if "content" in lines[index]["spans"][-1]:
|
||||
lines[index]["spans"][-1]["content"] += '\n\n'
|
||||
layout_list_info = [False, False] # 这个layout最后是不是列表,记录每一个layout里是不是列表开头,列表结尾
|
||||
for content_type, start, end in text_segments:
|
||||
if content_type == 'list':
|
||||
@@ -388,20 +422,17 @@ def __connect_list_inter_layout(blocks_group, new_layout_bbox, layout_list_info,
|
||||
logger.info(f"连接page {page_num} 内的list")
|
||||
# 向layout_paras[i] 寻找开头具有相同缩进的连续的行
|
||||
may_list_lines = []
|
||||
for j in range(len(next_paras)):
|
||||
lines = next_paras[j].get("lines", [])
|
||||
if len(lines) == 1: # 只可能是一行,多行情况再需要分析了
|
||||
if lines[0]['bbox'][0] > __find_layout_bbox_by_line(lines[0]['bbox'], new_layout_bbox)[0]:
|
||||
may_list_lines.append(lines[0])
|
||||
else:
|
||||
break
|
||||
lines = next_first_para.get("lines", [])
|
||||
|
||||
for line in lines:
|
||||
if line['bbox'][0] > __find_layout_bbox_by_line(line['bbox'], new_layout_bbox)[0]:
|
||||
may_list_lines.append(line)
|
||||
else:
|
||||
break
|
||||
# 如果这些行的缩进是相等的,那么连到上一个layout的最后一个段落上。
|
||||
if len(may_list_lines) > 0 and len(set([x['bbox'][0] for x in may_list_lines])) == 1:
|
||||
pre_last_para.extend(may_list_lines)
|
||||
blocks_group[i] = blocks_group[i][len(may_list_lines):]
|
||||
# layout_paras[i] = layout_paras[i][len(may_list_lines):]
|
||||
next_first_para["lines"] = next_first_para["lines"][len(may_list_lines):]
|
||||
|
||||
return blocks_group, [layout_list_info[0][0], layout_list_info[-1][1]] # 同时还返回了这个页面级别的开头、结尾是不是列表的信息
|
||||
|
||||
@@ -422,18 +453,14 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
|
||||
logger.info(f"连接page {page_num} 内的list")
|
||||
# 向layout_paras[i] 寻找开头具有相同缩进的连续的行
|
||||
may_list_lines = []
|
||||
for j in range(len(next_page_paras[0])):
|
||||
next_page_block_j = next_page_paras[0][j]
|
||||
if next_page_block_j["type"] != BlockType.Text:
|
||||
break
|
||||
lines = next_page_block_j["lines"]
|
||||
if len(lines) == 1: # 只可能是一行,多行情况再需要分析了
|
||||
if lines[0]['bbox'][0] > __find_layout_bbox_by_line(lines[0]['bbox'], next_page_layout_bbox)[0]:
|
||||
may_list_lines.append(lines[0])
|
||||
next_page_first_para = next_page_paras[0][0]
|
||||
if next_page_first_para["type"] == BlockType.Text:
|
||||
lines = next_page_first_para["lines"]
|
||||
for line in lines:
|
||||
if line['bbox'][0] > __find_layout_bbox_by_line(line['bbox'], next_page_layout_bbox)[0]:
|
||||
may_list_lines.append(line)
|
||||
else:
|
||||
break
|
||||
else:
|
||||
break
|
||||
# 如果这些行的缩进是相等的,那么连到上一个layout的最后一个段落上。
|
||||
if len(may_list_lines) > 0 and len(set([x['bbox'][0] for x in may_list_lines])) == 1:
|
||||
#pre_page_paras[-1].append(may_list_lines)
|
||||
@@ -442,7 +469,7 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
|
||||
for span in line["spans"]:
|
||||
span[CROSS_PAGE] = True
|
||||
pre_page_paras[-1][-1]["lines"].extend(may_list_lines)
|
||||
next_page_paras[0] = next_page_paras[0][len(may_list_lines):]
|
||||
next_page_first_para["lines"] = next_page_first_para["lines"][len(may_list_lines):]
|
||||
return True
|
||||
|
||||
return False
|
||||
@@ -471,7 +498,6 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
|
||||
if len(blocks_group) == 0:
|
||||
return connected_layout_blocks
|
||||
|
||||
#connected_layout_paras.append(layout_paras[0])
|
||||
connected_layout_blocks.append(blocks_group[0])
|
||||
for i in range(1, len(blocks_group)):
|
||||
try:
|
||||
@@ -484,6 +510,9 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
|
||||
if blocks_group[i - 1][-1]["type"] != BlockType.Text or blocks_group[i][0]["type"] != BlockType.Text:
|
||||
connected_layout_blocks.append(blocks_group[i])
|
||||
continue
|
||||
if len(blocks_group[i - 1][-1]["lines"]) == 0 or len(blocks_group[i][0]["lines"]) == 0:
|
||||
connected_layout_blocks.append(blocks_group[i])
|
||||
continue
|
||||
pre_last_line = blocks_group[i - 1][-1]["lines"][-1]
|
||||
next_first_line = blocks_group[i][0]["lines"][0]
|
||||
except Exception as e:
|
||||
@@ -505,7 +534,7 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
|
||||
|
||||
pre_last_line_text = pre_last_line_text.strip()
|
||||
next_first_line_text = next_first_line_text.strip()
|
||||
if pre_last_line['bbox'][2] == pre_x2_max and pre_last_line_text[-1] not in LINE_STOP_FLAG and \
|
||||
if pre_last_line['bbox'][2] == pre_x2_max and pre_last_line_text and pre_last_line_text[-1] not in LINE_STOP_FLAG and \
|
||||
next_first_line['bbox'][0] == next_x0_min: # 前面一行沾满了整个行,并且没有结尾符号.下一行没有空白开头。
|
||||
"""连接段落条件成立,将前一个layout的段落和后一个layout的段落连接。"""
|
||||
#connected_layout_paras[-1][-1].extend(layout_paras[i][0])
|
||||
@@ -557,8 +586,15 @@ def __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
|
||||
# 不是文本,不连接
|
||||
return False
|
||||
|
||||
pre_x2_max = __find_layout_bbox_by_line(pre_last_line['bbox'], pre_page_layout_bbox)[2]
|
||||
next_x0_min = __find_layout_bbox_by_line(next_first_line['bbox'], next_page_layout_bbox)[0]
|
||||
pre_x2_max_bbox = __find_layout_bbox_by_line(pre_last_line['bbox'], pre_page_layout_bbox)
|
||||
if not pre_x2_max_bbox:
|
||||
return False
|
||||
next_x0_min_bbox = __find_layout_bbox_by_line(next_first_line['bbox'], next_page_layout_bbox)
|
||||
if not next_x0_min_bbox:
|
||||
return False
|
||||
|
||||
pre_x2_max = pre_x2_max_bbox[2]
|
||||
next_x0_min = next_x0_min_bbox[0]
|
||||
|
||||
pre_last_line_text = pre_last_line_text.strip()
|
||||
next_first_line_text = next_first_line_text.strip()
|
||||
|
||||
@@ -111,6 +111,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
|
||||
spans = ocr_cut_image_and_table(spans, pdf_docs[page_id], page_id, pdf_bytes_md5, imageWriter)
|
||||
|
||||
'''将所有区块的bbox整理到一起'''
|
||||
# @todo interline_equation_blocks参数不够准,后面切换到interline_equations上
|
||||
if len(interline_equation_blocks) > 0:
|
||||
all_bboxes, all_discarded_blocks, drop_reasons = ocr_prepare_bboxes_for_layout_split(
|
||||
img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
|
||||
|
||||
@@ -33,6 +33,13 @@ class AbsPipe(ABC):
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def pipe_analyze(self):
|
||||
"""
|
||||
有状态的跑模型分析
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def pipe_parse(self):
|
||||
"""
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from magic_pdf.libs.MakeContentConfig import DropMode
|
||||
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
||||
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
||||
from magic_pdf.libs.json_compressor import JsonCompressor
|
||||
from magic_pdf.pipe.AbsPipe import AbsPipe
|
||||
from magic_pdf.user_api import parse_ocr_pdf
|
||||
|
||||
@@ -13,6 +13,9 @@ class OCRPipe(AbsPipe):
|
||||
def pipe_classify(self):
|
||||
pass
|
||||
|
||||
def pipe_analyze(self):
|
||||
self.model_list = doc_analyze(self.pdf_bytes, ocr=True)
|
||||
|
||||
def pipe_parse(self):
|
||||
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
from magic_pdf.libs.MakeContentConfig import DropMode
|
||||
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
||||
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
||||
from magic_pdf.libs.json_compressor import JsonCompressor
|
||||
from magic_pdf.pipe.AbsPipe import AbsPipe
|
||||
@@ -13,6 +14,9 @@ class TXTPipe(AbsPipe):
|
||||
def pipe_classify(self):
|
||||
pass
|
||||
|
||||
def pipe_analyze(self):
|
||||
self.model_list = doc_analyze(self.pdf_bytes, ocr=False)
|
||||
|
||||
def pipe_parse(self):
|
||||
self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@ import json
|
||||
from loguru import logger
|
||||
|
||||
from magic_pdf.libs.MakeContentConfig import DropMode
|
||||
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
||||
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
||||
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
||||
from magic_pdf.libs.commons import join_path
|
||||
@@ -15,14 +16,24 @@ class UNIPipe(AbsPipe):
|
||||
def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False):
    """Initialise the pipe from raw PDF bytes and a dict of pre-computed inputs.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        jso_useful_key: Dict that must contain the keys ``"_pdf_type"`` and
            ``"model_list"``.
        image_writer: Writer object forwarded to the parent initialiser.
        is_debug: Debug flag forwarded to the parent initialiser.
    """
    self.pdf_type = jso_useful_key["_pdf_type"]
    super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug)
    # Remember whether the caller supplied no model output at all.
    # NOTE(review): relies on the parent initialiser storing the list as
    # self.model_list — confirm against AbsPipe.
    self.input_model_is_empty = len(self.model_list) == 0
|
||||
|
||||
def pipe_classify(self):
|
||||
self.pdf_type = AbsPipe.classify(self.pdf_bytes)
|
||||
|
||||
def pipe_analyze(self):
|
||||
if self.pdf_type == self.PIP_TXT:
|
||||
self.model_list = doc_analyze(self.pdf_bytes, ocr=False)
|
||||
elif self.pdf_type == self.PIP_OCR:
|
||||
self.model_list = doc_analyze(self.pdf_bytes, ocr=True)
|
||||
|
||||
def pipe_parse(self):
|
||||
if self.pdf_type == self.PIP_TXT:
|
||||
self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer,
|
||||
is_debug=self.is_debug)
|
||||
is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty)
|
||||
elif self.pdf_type == self.PIP_OCR:
|
||||
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
|
||||
is_debug=self.is_debug)
|
||||
|
||||
@@ -107,6 +107,7 @@ def _is_in_or_part_overlap(box1, box2) -> bool:
|
||||
or y0_1 > y1_2
|
||||
) # box1在box2的下边
|
||||
|
||||
|
||||
def remove_text_block_overlap_interline_equation_bbox(
|
||||
interline_eq_bboxes, pymu_block_list
|
||||
):
|
||||
@@ -122,10 +123,10 @@ def remove_text_block_overlap_interline_equation_bbox(
|
||||
deleted_chars = []
|
||||
for char in span["chars"]:
|
||||
if any(
|
||||
[
|
||||
_is_in_or_part_overlap(char["bbox"], eq_bbox["bbox"])
|
||||
for eq_bbox in interline_eq_bboxes
|
||||
]
|
||||
[
|
||||
(calculate_overlap_area_2_minbox_area_ratio(eq_bbox["bbox"], char["bbox"]) > 0.5)
|
||||
for eq_bbox in interline_eq_bboxes
|
||||
]
|
||||
):
|
||||
deleted_chars.append(char)
|
||||
# 检查span里没有char则删除这个span
|
||||
|
||||
@@ -36,6 +36,9 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
|
||||
all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
|
||||
'''任何框体与舍弃框重叠,优先信任舍弃框'''
|
||||
all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
|
||||
# @todo interline_equation 与title或text框冲突的情况,分两种情况处理
|
||||
'''interline_equation框与文本类型框iou比较接近1的时候,信任行间公式框'''
|
||||
'''interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框'''
|
||||
|
||||
'''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)'''
|
||||
for discarded in discarded_blocks:
|
||||
|
||||
@@ -160,12 +160,12 @@ def fill_spans_in_blocks(blocks, spans, radio):
|
||||
block_spans.append(span)
|
||||
|
||||
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
|
||||
displayed_list = []
|
||||
text_inline_lines = []
|
||||
modify_y_axis(block_spans, displayed_list, text_inline_lines)
|
||||
# displayed_list = []
|
||||
# text_inline_lines = []
|
||||
# modify_y_axis(block_spans, displayed_list, text_inline_lines)
|
||||
|
||||
'''模型识别错误的行间公式, type类型转换成行内公式'''
|
||||
block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)
|
||||
# block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)
|
||||
|
||||
'''bbox去除粘连''' # 去粘连会影响span的bbox,导致后续fill的时候出错
|
||||
# block_spans = remove_overlap_between_bbox_for_span(block_spans)
|
||||
@@ -196,8 +196,10 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks):
|
||||
block = fix_image_block(block, img_blocks)
|
||||
elif block_type == BlockType.Table:
|
||||
block = fix_table_block(block, table_blocks)
|
||||
elif block_type in [BlockType.Text, BlockType.Title, BlockType.InterlineEquation]:
|
||||
elif block_type in [BlockType.Text, BlockType.Title]:
|
||||
block = fix_text_block(block)
|
||||
elif block_type == BlockType.InterlineEquation:
|
||||
block = fix_interline_block(block)
|
||||
else:
|
||||
continue
|
||||
fix_blocks.append(block)
|
||||
@@ -315,6 +317,18 @@ def fix_table_block(block, table_blocks):
|
||||
|
||||
|
||||
def fix_text_block(block):
    """Replace a text block's ``'spans'`` entry with a ``'lines'`` entry.

    Every span typed as an interline equation is first re-typed as an inline
    equation. The spans are then passed through ``merge_spans_to_line`` and
    ``line_sort_spans_by_left_to_right``; the result is stored under
    ``block['lines']`` and the ``'spans'`` key is removed.

    Args:
        block: Block dict holding a ``'spans'`` list; modified in place.

    Returns:
        The same ``block`` object.
    """
    # Formula spans inside a text block should all be treated as inline equations.
    for span in block['spans']:
        if span['type'] == ContentType.InterlineEquation:
            span['type'] = ContentType.InlineEquation
    merged_lines = merge_spans_to_line(block['spans'])
    block['lines'] = line_sort_spans_by_left_to_right(merged_lines)
    del block['spans']
    return block
|
||||
|
||||
|
||||
def fix_interline_block(block):
|
||||
block_lines = merge_spans_to_line(block['spans'])
|
||||
sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
|
||||
block['lines'] = sort_block_lines
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
|
||||
"""
|
||||
用户输入:
|
||||
model数组,每个元素代表一个页面
|
||||
@@ -16,14 +15,16 @@ import re
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from magic_pdf.libs.version import __version__
|
||||
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
||||
from magic_pdf.rw import AbsReaderWriter
|
||||
from magic_pdf.pdf_parse_by_ocr_v2 import parse_pdf_by_ocr
|
||||
from magic_pdf.pdf_parse_by_txt_v2 import parse_pdf_by_txt
|
||||
|
||||
|
||||
PARSE_TYPE_TXT = "txt"
|
||||
PARSE_TYPE_OCR = "ocr"
|
||||
|
||||
|
||||
def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args,
|
||||
**kwargs):
|
||||
"""
|
||||
@@ -39,6 +40,8 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
|
||||
|
||||
pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT
|
||||
|
||||
pdf_info_dict["_version_name"] = __version__
|
||||
|
||||
return pdf_info_dict
|
||||
|
||||
|
||||
@@ -57,10 +60,13 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
|
||||
|
||||
pdf_info_dict["_parse_type"] = PARSE_TYPE_OCR
|
||||
|
||||
pdf_info_dict["_version_name"] = __version__
|
||||
|
||||
return pdf_info_dict
|
||||
|
||||
|
||||
def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0,
|
||||
input_model_is_empty: bool = False,
|
||||
*args, **kwargs):
|
||||
"""
|
||||
ocr和文本混合的pdf,全部解析出来
|
||||
@@ -88,7 +94,7 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
|
||||
for span in line['spans']:
|
||||
text_all += span['content']
|
||||
|
||||
def calculate_garbled_rate(text):
|
||||
def calculate_not_common_character_rate(text):
|
||||
garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]')
|
||||
# 计算乱码字符的数量
|
||||
garbage_count = len(garbage_regex.findall(text))
|
||||
@@ -97,10 +103,30 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
|
||||
return 0 # 避免除以零的错误
|
||||
return garbage_count / total
|
||||
|
||||
garbled_rate = calculate_garbled_rate(text_all)
|
||||
def calculate_not_printable_rate(text):
    """Return the fraction of characters in ``text`` that are not printable.

    A character counts as non-printable when ``str.isprintable()`` is False
    for it (this includes control characters such as newline and tab).

    Args:
        text: The string to analyse.

    Returns:
        ``0`` for an empty string, otherwise the number of non-printable
        characters divided by ``len(text)``.
    """
    total = len(text)
    if total == 0:
        return 0  # avoid dividing by zero
    # Count directly instead of building a throwaway string of printable chars.
    not_printable_total = sum(1 for c in text if not c.isprintable())
    return not_printable_total / total
|
||||
|
||||
if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False) or garbled_rate > 0.8:
|
||||
not_common_character_rate = calculate_not_common_character_rate(text_all)
|
||||
not_printable_rate = calculate_not_printable_rate(text_all)
|
||||
pdf_info_dict["_not_common_character_rate"] = not_common_character_rate
|
||||
pdf_info_dict["_not_printable_rate"] = not_printable_rate
|
||||
logger.info(f"not_common_character_rate: {not_common_character_rate}, not_printable_rate: {not_printable_rate}")
|
||||
# not_common_character_rate对小语种可能会有误伤,not_printable_rate对小语种较为友好
|
||||
if (pdf_info_dict is None
|
||||
or pdf_info_dict.get("_need_drop", False)
|
||||
or not_printable_rate > 0.02 # 参考一些正常的pdf,这个值没有超过0.01的,阈值设为0.02
|
||||
):
|
||||
logger.warning(f"parse_pdf_by_txt drop or error or garbled_rate too large, switch to parse_pdf_by_ocr")
|
||||
if input_model_is_empty:
|
||||
pdf_models = doc_analyze(pdf_bytes, ocr=True)
|
||||
pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
|
||||
if pdf_info_dict is None:
|
||||
raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
|
||||
@@ -109,4 +135,6 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
|
||||
else:
|
||||
pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT
|
||||
|
||||
pdf_info_dict["_version_name"] = __version__
|
||||
|
||||
return pdf_info_dict
|
||||
|
||||
@@ -2,16 +2,16 @@ boto3>=1.28.43
|
||||
Brotli>=1.1.0
|
||||
click>=8.1.7
|
||||
Distance>=0.1.3
|
||||
PyMuPDF>=1.24.3
|
||||
PyMuPDF>=1.24.5
|
||||
loguru>=0.6.0
|
||||
matplotlib>=3.8.3
|
||||
numpy>=1.21.6
|
||||
pandas>=1.3.5
|
||||
pycld2>=0.41
|
||||
fast-langdetect>=0.1.1
|
||||
regex>=2023.12.25
|
||||
termcolor>=2.4.0
|
||||
wordninja>=2.0.0
|
||||
scikit-learn>=1.0.2
|
||||
nltk==3.8.1
|
||||
s3pathlib>=2.1.1
|
||||
pytest
|
||||
paddleocr @ https://github.com/magicpdf/PaddleOCR/releases/download/paddleocr-2.8.2-released/paddleocr-2.8.2-py3-none-any.whl
|
||||
49
setup.py
49
setup.py
@@ -1,5 +1,7 @@
|
||||
from setuptools import setup, find_packages
|
||||
import subprocess
|
||||
from magic_pdf.libs.version import __version__
|
||||
|
||||
|
||||
def parse_requirements(filename):
|
||||
with open(filename) as f:
|
||||
lines = f.read().splitlines()
|
||||
@@ -8,37 +10,26 @@ def parse_requirements(filename):
|
||||
|
||||
for line in lines:
|
||||
if "http" in line:
|
||||
pkg_name_with_version = line.split('/')[-1].split('-')[0]
|
||||
requires.append(pkg_name_with_version)
|
||||
pkg_name_without_url = line.split('@')[0].strip()
|
||||
requires.append(pkg_name_without_url)
|
||||
else:
|
||||
requires.append(line)
|
||||
|
||||
return requires
|
||||
|
||||
def get_version():
|
||||
command = ["git", "describe", "--tags"]
|
||||
try:
|
||||
version = subprocess.check_output(command).decode().strip()
|
||||
version_parts = version.split("-")
|
||||
if len(version_parts) > 1 and version_parts[0].startswith("magic_pdf"):
|
||||
return version_parts[1]
|
||||
else:
|
||||
raise ValueError(f"Invalid version tag {version}. Expected format is magic_pdf-<version>-released.")
|
||||
except Exception as e:
|
||||
print(e)
|
||||
return "0.0.0"
|
||||
|
||||
|
||||
requires = parse_requirements('requirements.txt')
|
||||
|
||||
setup(
|
||||
name="magic_pdf", # 项目名
|
||||
# version="0.1.3", # 版本号
|
||||
version=get_version(), # 自动从tag中获取版本号
|
||||
packages=find_packages(), # 包含所有的包
|
||||
install_requires=requires, # 项目依赖的第三方库
|
||||
python_requires=">=3.9", # 项目依赖的 Python 版本
|
||||
# entry_points={"console_scripts": ["my_command=my_project.main:run"]}, # 项目提供的可执行命令
|
||||
include_package_data=True, # 是否包含非代码文件,如数据文件、配置文件等
|
||||
zip_safe=False, # 是否使用 zip 文件格式打包,一般设为 False
|
||||
)
|
||||
if __name__ == '__main__':
|
||||
setup(
|
||||
name="magic_pdf", # 项目名
|
||||
version=__version__, # 自动从tag中获取版本号
|
||||
packages=find_packages(), # 包含所有的包
|
||||
install_requires=parse_requirements('requirements.txt'), # 项目依赖的第三方库
|
||||
extras_require={
|
||||
"gpu": ["paddlepaddle-gpu"],
|
||||
"cpu": ["paddlepaddle"],
|
||||
},
|
||||
python_requires=">=3.9", # 项目依赖的 Python 版本
|
||||
# entry_points={"console_scripts": ["my_command=my_project.main:run"]}, # 项目提供的可执行命令
|
||||
include_package_data=True, # 是否包含非代码文件,如数据文件、配置文件等
|
||||
zip_safe=False, # 是否使用 zip 文件格式打包,一般设为 False
|
||||
)
|
||||
|
||||
27
update_version.py
Normal file
27
update_version.py
Normal file
@@ -0,0 +1,27 @@
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
|
||||
def get_version():
    """Derive the package version from the output of ``git describe --tags``.

    The output is split on ``-``; when there is more than one field and the
    first starts with ``magic_pdf`` (expected tag format
    ``magic_pdf-<version>-released``), the second field is returned.

    Returns:
        The version string taken from the tag, or ``"0.0.0"`` when git cannot
        be run, the command fails, or the tag does not have the expected shape.
    """
    command = ["git", "describe", "--tags"]
    try:
        version = subprocess.check_output(command).decode().strip()
        version_parts = version.split("-")
        if len(version_parts) > 1 and version_parts[0].startswith("magic_pdf"):
            return version_parts[1]
        raise ValueError(f"Invalid version tag {version}. Expected format is magic_pdf-<version>-released.")
    except (subprocess.CalledProcessError, OSError, ValueError) as e:
        # Best-effort lookup: a missing git binary (OSError), a failing command
        # (CalledProcessError) or undecodable / malformed output (ValueError)
        # falls back to a placeholder version instead of aborting.
        print(e)
        return "0.0.0"
|
||||
|
||||
|
||||
def write_version_to_commons(version):
    """Write ``__version__ = "<version>"`` to ``magic_pdf/libs/version.py``.

    The target path is resolved relative to the directory containing this
    script, and any existing file content is overwritten.

    Args:
        version: Version string to embed in the generated module.
    """
    version_path = os.path.join(os.path.dirname(__file__), 'magic_pdf', 'libs', 'version.py')
    # Explicit encoding keeps the generated file identical across platforms.
    with open(version_path, 'w', encoding='utf-8') as f:
        f.write(f'__version__ = "{version}"\n')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
version_name = get_version()
|
||||
write_version_to_commons(version_name)
|
||||
Reference in New Issue
Block a user