Compare commits

...

42 Commits

Author SHA1 Message Date
赵小蒙
54f31b65cb update cli 2024-06-04 19:10:29 +08:00
myhloli
4ce15c44f3 Update version.py with new version 2024-06-04 10:10:12 +00:00
赵小蒙
88f2245d86 update cli 2024-06-04 18:03:12 +08:00
myhloli
bc05526602 Update version.py with new version 2024-06-04 09:21:33 +00:00
赵小蒙
b18e9365fa Merge remote-tracking branch 'origin/master'
# Conflicts:
#	magic_pdf/libs/version.py
2024-06-04 17:21:01 +08:00
赵小蒙
48b6992b71 update workflow 2024-06-04 17:20:11 +08:00
myhloli
4f6171d19e Update version.py with new version 2024-06-04 09:15:57 +00:00
赵小蒙
595517054b update workflow 2024-06-04 17:14:07 +08:00
赵小蒙
705c4dcf30 update workflow 2024-06-04 17:10:59 +08:00
赵小蒙
ff52be3304 update workflow 2024-06-04 17:08:01 +08:00
赵小蒙
a68f4174cd update workflow 2024-06-04 16:58:19 +08:00
赵小蒙
2d0d5a8208 update workflow 2024-06-04 16:38:47 +08:00
赵小蒙
887a3d989b update workflow 2024-06-04 16:37:49 +08:00
赵小蒙
6ab1a65a6a fix error 2024-06-04 16:26:37 +08:00
赵小蒙
48d3032318 fix error 2024-06-04 16:03:06 +08:00
赵小蒙
ddde1b82f2 fix error 2024-06-04 15:49:46 +08:00
赵小蒙
c7a685b302 fix error 2024-06-04 15:12:56 +08:00
赵小蒙
93a59ff4a3 fix error 2024-06-04 15:04:27 +08:00
赵小蒙
ab8413811f fix error 2024-06-04 12:16:41 +08:00
赵小蒙
e73964fc12 fix error 2024-06-04 12:13:30 +08:00
赵小蒙
b74f17e439 fix error 2024-06-04 12:08:37 +08:00
赵小蒙
20278040a5 fix error 2024-06-04 11:59:43 +08:00
赵小蒙
9d0b4e95de fix error: version is 0.0.0 2024-06-04 11:51:39 +08:00
赵小蒙
7fd8d97edb fix error: version is 0.0.0 2024-06-04 11:48:24 +08:00
赵小蒙
1877055672 fix error 2024-06-04 11:44:49 +08:00
赵小蒙
75d0fa3d24 fix error 2024-06-04 11:38:15 +08:00
赵小蒙
07f6c49707 chanage update version logic 2024-06-04 11:33:57 +08:00
赵小蒙
1de37e4c65 add version_name to middle json 2024-06-04 11:15:52 +08:00
赵小蒙
bd1834284e add version_name to middle json 2024-06-03 18:51:38 +08:00
赵小蒙
496045f361 update annotation 2024-05-31 10:27:23 +08:00
赵小蒙
75478eda89 update setup 2024-05-30 10:26:10 +08:00
赵小蒙
3f3edc39f5 update setup 2024-05-30 10:25:02 +08:00
赵小蒙
97a4e47319 change garbled rate check from not_common_character_rate to not_printable_rate 2024-05-28 18:22:01 +08:00
myhloli
5de372245c Merge pull request #116 from papayalove/master
修复分段边界问题
2024-05-28 10:26:19 +08:00
Kaiwen Liu
135adac43d Merge branch 'magicpdf:master' into master 2024-05-28 10:21:11 +08:00
liukaiwen
ba52e33527 修复分段边界问题 2024-05-28 10:20:47 +08:00
myhloli
78ed786794 Merge pull request #115 from papayalove/master
修复边界问题(修复list拼接和reference分行问题)
2024-05-27 15:19:15 +08:00
liukaiwen
4ff09a2fbc 修复边界问题(修复list拼接和reference分行问题) 2024-05-27 15:16:00 +08:00
赵小蒙
f8548a8ea2 update PyMuPDF to 1.24.4 2024-05-27 14:40:17 +08:00
myhloli
10a95bcd05 Merge pull request #114 from papayalove/master
修复list拼接和reference分行问题
2024-05-24 16:34:07 +08:00
liukaiwen
dbdbaf58be Merge branch 'master' of github.com:papayalove/Magic-PDF 2024-05-24 16:31:11 +08:00
liukaiwen
afe92f07d6 修复list拼接和reference分行问题 2024-05-24 16:31:00 +08:00
9 changed files with 234 additions and 118 deletions

View File

@@ -11,8 +11,51 @@ on:
jobs:
build:
update-version:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
ref: master
fetch-depth: 0
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.10"
- name: Update version.py
run: |
python update_version.py
- name: Verify version.py
run: |
ls -l magic_pdf/libs/version.py
cat magic_pdf/libs/version.py
- name: Commit changes
run: |
git config --local user.email "moe@myhloli.com"
git config --local user.name "myhloli"
git add magic_pdf/libs/version.py
if git diff-index --quiet HEAD; then
echo "No changes to commit"
else
git commit -m "Update version.py with new version"
fi
id: commit_changes
- name: Push changes
if: steps.commit_changes.outcome == 'success'
env:
GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }}
run: |
git push origin HEAD:master
build:
needs: [ update-version ]
runs-on: ubuntu-latest
strategy:
fail-fast: false
@@ -23,8 +66,14 @@ jobs:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: master
fetch-depth: 0
- name: Verify version.py
run: |
ls -l magic_pdf/libs/version.py
cat magic_pdf/libs/version.py
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:

View File

@@ -27,6 +27,7 @@ import sys
import click
from loguru import logger
from pathlib import Path
from magic_pdf.libs.version import __version__
from magic_pdf.libs.MakeContentConfig import DropMode
from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
@@ -52,7 +53,7 @@ def prepare_env(pdf_file_name, method):
get_local_dir(), "magic-pdf", pdf_file_name, method
)
local_image_dir = os.path.join(local_parent_dir, "images")
local_image_dir = os.path.join(str(local_parent_dir), "images")
local_md_dir = local_parent_dir
os.makedirs(local_image_dir, exist_ok=True)
os.makedirs(local_md_dir, exist_ok=True)
@@ -97,6 +98,8 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
# Root click command group; subcommands register on it via @cli.command().
@click.group()
# -v / --version reports __version__ (imported from magic_pdf.libs.version).
@click.version_option(__version__, "--version", "-v", help="显示版本信息")
# -h is accepted as a short alias for --help.
@click.help_option("--help", "-h", help="显示帮助信息")
def cli():
    # Intentionally empty: the group only serves as the attachment point for
    # the subcommands.
    pass
@@ -141,7 +144,7 @@ def json_command(json, method):
pdf_file_name = Path(s3_file_path).stem
pdf_data = read_s3_path(s3_file_path)
local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
local_md_dir
)
@@ -158,60 +161,60 @@ def json_command(json, method):
)
@cli.command()
@click.option("--local_json", type=str, help="输入一个本地jsonl路径")
@click.option(
"--method",
type=parse_pdf_methods,
help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
default="auto",
)
def local_json_command(local_json, method):
def read_s3_path(s3path):
bucket, key = parse_s3path(s3path)
@cli.command()
@click.option("--local_json", type=str, help="输入一个本地jsonl路径")
@click.option(
"--method",
type=parse_pdf_methods,
help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
default="auto",
)
def local_json_command(local_json, method):
def read_s3_path(s3path):
bucket, key = parse_s3path(s3path)
s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
s3_rw = S3ReaderWriter(
s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
)
may_range_params = parse_s3_range_params(s3path)
if may_range_params is None or 2 != len(may_range_params):
byte_start, byte_end = 0, None
else:
byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
byte_end += byte_start - 1
return s3_rw.read_jsonl(
remove_non_official_s3_args(s3path),
byte_start,
byte_end,
AbsReaderWriter.MODE_BIN,
s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
s3_rw = S3ReaderWriter(
s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
)
may_range_params = parse_s3_range_params(s3path)
if may_range_params is None or 2 != len(may_range_params):
byte_start, byte_end = 0, None
else:
byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
byte_end += byte_start - 1
return s3_rw.read_jsonl(
remove_non_official_s3_args(s3path),
byte_start,
byte_end,
AbsReaderWriter.MODE_BIN,
)
with open(local_json, "r", encoding="utf-8") as f:
for json_line in f:
jso = json_parse.loads(json_line)
s3_file_path = jso.get("file_location")
if s3_file_path is None:
s3_file_path = jso.get("path")
pdf_file_name = Path(s3_file_path).stem
pdf_data = read_s3_path(s3_file_path)
local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
local_md_dir
)
with open(local_json, "r", encoding="utf-8") as f:
for json_line in f:
jso = json_parse.loads(json_line)
s3_file_path = jso.get("file_location")
if s3_file_path is None:
s3_file_path = jso.get("path")
pdf_file_name = Path(s3_file_path).stem
pdf_data = read_s3_path(s3_file_path)
local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
local_md_dir
)
_do_parse(
pdf_file_name,
pdf_data,
jso["doc_layout_result"],
method,
local_image_rw,
local_md_rw,
os.path.basename(local_image_dir),
local_md_dir
)
_do_parse(
pdf_file_name,
pdf_data,
jso["doc_layout_result"],
method,
local_image_rw,
local_md_rw,
os.path.basename(local_image_dir),
local_md_dir
)
@cli.command()

View File

@@ -1,6 +1,7 @@
import datetime
import json
import os, re, configparser
import subprocess
import time
import boto3
@@ -11,6 +12,7 @@ from botocore.config import Config
import fitz # 1.23.9中已经切换到rebase
# import fitz_old as fitz # 使用1.23.9之前的pymupdf库
def get_delta_time(input_time):
    """Return the seconds elapsed since ``input_time``, rounded to 2 decimals.

    ``input_time`` is subtracted from the current ``time.time()`` value, so a
    timestamp in the future produces a negative result.
    """
    elapsed = time.time() - input_time
    return round(elapsed, 2)

View File

@@ -0,0 +1 @@
__version__ = "0.4.24"

View File

@@ -1,7 +1,7 @@
from sklearn.cluster import DBSCAN
import numpy as np
from loguru import logger
import re
from magic_pdf.libs.boxbase import _is_in_or_part_overlap_with_area_ratio as is_in_layout
from magic_pdf.libs.ocr_content_type import ContentType, BlockType
from magic_pdf.model.magic_model import MagicModel
@@ -106,16 +106,19 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
3. 如果非顶格首字符大写编码为2
4. 如果非顶格首字符非大写编码为3
"""
if len(lines) > 0:
x_map_tag_dict, min_x_tag = cluster_line_x(lines)
for l in lines:
first_char = __get_span_text(l['spans'][0])[0]
span_text = __get_span_text(l['spans'][0])
first_char = span_text[0]
layout = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)
if not layout:
line_fea_encode.append(0)
else:
layout_left = layout[0]
if l['bbox'][0] == layout_left:
#
if x_map_tag_dict[round(l['bbox'][0])] == min_x_tag:
# if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
if not first_char.isalnum():
if not first_char.isalnum() or if_match_reference_list(span_text):
line_fea_encode.append(1)
else:
line_fea_encode.append(4)
@@ -144,6 +147,36 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
return split_indices(total_lines, list_indice), list_start_idx
def cluster_line_x(lines: list) -> tuple:
    """
    Cluster the left edges (x0) of all line bboxes inside one block.

    The rounded x0 of every line is clustered with DBSCAN and each value is
    mapped to the smallest x0 found in its cluster.

    Args:
        lines: list of line dicts; ``line['bbox'][0]`` is read as the x0 of
            the line. Presumably must be non-empty (``lines[0]`` is read
            unconditionally) -- callers should check before calling.

    Returns:
        A ``(x0_2_new_val, min_x0)`` pair: a dict mapping each rounded x0 to
        the rounded minimum x0 of its cluster, and the smallest cluster
        minimum seen (initialised from the first line's rounded x0).
    """
    min_distance = 5  # DBSCAN eps: largest gap that still chains two x0 values together
    min_sample = 1  # a single point is enough to form its own cluster
    x0_lst = np.array([[round(line['bbox'][0]), 0] for line in lines])
    x0_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x0_lst)
    x0_2_new_val = {}  # maps each original (rounded) x0 to its cluster's minimum x0
    min_x0 = round(lines[0]["bbox"][0])
    for label in np.unique(x0_clusters.labels_):
        # -1 is DBSCAN's noise label; such points get no mapping.
        if label == -1:
            continue
        x0_raw_val = x0_lst[x0_clusters.labels_ == label][:, 0]
        x0_new_val = np.min(x0_raw_val)
        x0_2_new_val.update({round(raw_val): round(x0_new_val) for raw_val in x0_raw_val})
        if x0_new_val < min_x0:
            min_x0 = x0_new_val
    return x0_2_new_val, min_x0
def if_match_reference_list(text: str) -> bool:
    """
    Tell whether ``text`` starts like a numbered reference-list entry.

    A match requires one or more digits immediately followed by a dot at the
    very beginning of the string, e.g. ``"1. Smith ..."`` or ``"12.foo"``.

    Args:
        text: the text to inspect.

    Returns:
        True when the text starts with ``<digits>.``, otherwise False.
    """
    return re.match(r'^\d+\..*', text) is not None
def __valign_lines(blocks, layout_bboxes):
"""
@@ -315,10 +348,11 @@ def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en"):
"""
for list_start in list_start_line:
if len(list_start) > 1:
for i in range(1, len(list_start)):
for i in range(0, len(list_start)):
index = list_start[i] - 1
if "content" in lines[index]["spans"][-1]:
lines[index]["spans"][-1]["content"] += '\n\n'
if index >= 0:
if "content" in lines[index]["spans"][-1]:
lines[index]["spans"][-1]["content"] += '\n\n'
layout_list_info = [False, False] # 这个layout最后是不是列表,记录每一个layout里是不是列表开头列表结尾
for content_type, start, end in text_segments:
if content_type == 'list':
@@ -388,20 +422,17 @@ def __connect_list_inter_layout(blocks_group, new_layout_bbox, layout_list_info,
logger.info(f"连接page {page_num} 内的list")
# 向layout_paras[i] 寻找开头具有相同缩进的连续的行
may_list_lines = []
for j in range(len(next_paras)):
lines = next_paras[j].get("lines", [])
if len(lines) == 1: # 只可能是一行,多行情况再需要分析了
if lines[0]['bbox'][0] > __find_layout_bbox_by_line(lines[0]['bbox'], new_layout_bbox)[0]:
may_list_lines.append(lines[0])
else:
break
lines = next_first_para.get("lines", [])
for line in lines:
if line['bbox'][0] > __find_layout_bbox_by_line(line['bbox'], new_layout_bbox)[0]:
may_list_lines.append(line)
else:
break
# 如果这些行的缩进是相等的那么连到上一个layout的最后一个段落上。
if len(may_list_lines) > 0 and len(set([x['bbox'][0] for x in may_list_lines])) == 1:
pre_last_para.extend(may_list_lines)
blocks_group[i] = blocks_group[i][len(may_list_lines):]
# layout_paras[i] = layout_paras[i][len(may_list_lines):]
next_first_para["lines"] = next_first_para["lines"][len(may_list_lines):]
return blocks_group, [layout_list_info[0][0], layout_list_info[-1][1]] # 同时还返回了这个页面级别的开头、结尾是不是列表的信息
@@ -422,18 +453,14 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
logger.info(f"连接page {page_num} 内的list")
# 向layout_paras[i] 寻找开头具有相同缩进的连续的行
may_list_lines = []
for j in range(len(next_page_paras[0])):
next_page_block_j = next_page_paras[0][j]
if next_page_block_j["type"] != BlockType.Text:
break
lines = next_page_block_j["lines"]
if len(lines) == 1: # 只可能是一行,多行情况再需要分析了
if lines[0]['bbox'][0] > __find_layout_bbox_by_line(lines[0]['bbox'], next_page_layout_bbox)[0]:
may_list_lines.append(lines[0])
next_page_first_para = next_page_paras[0][0]
if next_page_first_para["type"] == BlockType.Text:
lines = next_page_first_para["lines"]
for line in lines:
if line['bbox'][0] > __find_layout_bbox_by_line(line['bbox'], next_page_layout_bbox)[0]:
may_list_lines.append(line)
else:
break
else:
break
# 如果这些行的缩进是相等的那么连到上一个layout的最后一个段落上。
if len(may_list_lines) > 0 and len(set([x['bbox'][0] for x in may_list_lines])) == 1:
#pre_page_paras[-1].append(may_list_lines)
@@ -442,7 +469,7 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
for span in line["spans"]:
span[CROSS_PAGE] = True
pre_page_paras[-1][-1]["lines"].extend(may_list_lines)
next_page_paras[0] = next_page_paras[0][len(may_list_lines):]
next_page_first_para["lines"] = next_page_first_para["lines"][len(may_list_lines):]
return True
return False
@@ -471,7 +498,6 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
if len(blocks_group) == 0:
return connected_layout_blocks
#connected_layout_paras.append(layout_paras[0])
connected_layout_blocks.append(blocks_group[0])
for i in range(1, len(blocks_group)):
try:
@@ -484,6 +510,9 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
if blocks_group[i - 1][-1]["type"] != BlockType.Text or blocks_group[i][0]["type"] != BlockType.Text:
connected_layout_blocks.append(blocks_group[i])
continue
if len(blocks_group[i - 1][-1]["lines"]) == 0 or len(blocks_group[i][0]["lines"]) == 0:
connected_layout_blocks.append(blocks_group[i])
continue
pre_last_line = blocks_group[i - 1][-1]["lines"][-1]
next_first_line = blocks_group[i][0]["lines"][0]
except Exception as e:
@@ -505,7 +534,7 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
pre_last_line_text = pre_last_line_text.strip()
next_first_line_text = next_first_line_text.strip()
if pre_last_line['bbox'][2] == pre_x2_max and pre_last_line_text[-1] not in LINE_STOP_FLAG and \
if pre_last_line['bbox'][2] == pre_x2_max and pre_last_line_text and pre_last_line_text[-1] not in LINE_STOP_FLAG and \
next_first_line['bbox'][0] == next_x0_min: # 前面一行沾满了整个行,并且没有结尾符号.下一行没有空白开头。
"""连接段落条件成立将前一个layout的段落和后一个layout的段落连接。"""
#connected_layout_paras[-1][-1].extend(layout_paras[i][0])

View File

@@ -16,11 +16,11 @@ import re
from loguru import logger
from magic_pdf.libs.version import __version__
from magic_pdf.rw import AbsReaderWriter
from magic_pdf.pdf_parse_by_ocr_v2 import parse_pdf_by_ocr
from magic_pdf.pdf_parse_by_txt_v2 import parse_pdf_by_txt
PARSE_TYPE_TXT = "txt"
PARSE_TYPE_OCR = "ocr"
@@ -39,6 +39,8 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT
pdf_info_dict["_version_name"] = __version__
return pdf_info_dict
@@ -57,6 +59,8 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
pdf_info_dict["_parse_type"] = PARSE_TYPE_OCR
pdf_info_dict["_version_name"] = __version__
return pdf_info_dict
@@ -88,7 +92,7 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
for span in line['spans']:
text_all += span['content']
def calculate_garbled_rate(text):
def calculate_not_common_character_rate(text):
garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]')
# 计算乱码字符的数量
garbage_count = len(garbage_regex.findall(text))
@@ -97,9 +101,18 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
return 0 # 避免除以零的错误
return garbage_count / total
garbled_rate = calculate_garbled_rate(text_all)
def calculate_not_printable_rate(text):
    """Return the share of characters in ``text`` that are not printable.

    A character counts as not printable when ``str.isprintable()`` is false
    for it. Empty text yields 0.
    """
    if not text:
        return 0  # avoid a division by zero
    not_printable = sum(1 for ch in text if not ch.isprintable())
    return not_printable / len(text)
if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False) or garbled_rate > 0.8:
# not_common_character_rate = calculate_not_common_character_rate(text_all)
not_printable_rate = calculate_not_printable_rate(text_all)
# 测试乱码pdfnot_common_character_rate > 0.95, not_printable_rate > 0.15
# not_common_character_rate对小语种可能会有误伤not_printable_rate对小语种较为友好
if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False) or not_printable_rate > 0.1:
logger.warning(f"parse_pdf_by_txt drop or error or garbled_rate too large, switch to parse_pdf_by_ocr")
pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
if pdf_info_dict is None:
@@ -109,4 +122,6 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
else:
pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT
pdf_info_dict["_version_name"] = __version__
return pdf_info_dict

View File

@@ -2,7 +2,7 @@ boto3>=1.28.43
Brotli>=1.1.0
click>=8.1.7
Distance>=0.1.3
PyMuPDF>=1.24.3
PyMuPDF>=1.24.4
loguru>=0.6.0
matplotlib>=3.8.3
numpy>=1.21.6

View File

@@ -1,5 +1,10 @@
from setuptools import setup, find_packages
import os
import subprocess
from setuptools import setup, find_packages
from magic_pdf.libs.version import __version__
def parse_requirements(filename):
with open(filename) as f:
lines = f.read().splitlines()
@@ -8,37 +13,22 @@ def parse_requirements(filename):
for line in lines:
if "http" in line:
pkg_name_with_version = line.split('/')[-1].split('-')[0]
requires.append(pkg_name_with_version)
pkg_name_without_url = line.split('@')[0].strip()
requires.append(pkg_name_without_url)
else:
requires.append(line)
return requires
def get_version():
    """
    Derive the package version from the output of ``git describe --tags``.

    The output is split on ``-``; when the first part starts with
    ``magic_pdf`` the second part is returned as the version
    (expected tag format: ``magic_pdf-<version>-released``).

    Returns:
        The version string taken from the tag, or ``"0.0.0"`` when git cannot
        be run, exits with an error, or the tag has an unexpected format. The
        reason is printed in every fallback case.
    """
    command = ["git", "describe", "--tags"]
    try:
        version = subprocess.check_output(command).decode().strip()
    except (subprocess.SubprocessError, OSError, UnicodeDecodeError) as e:
        # git missing, non-zero exit (e.g. no tags / not a repository) or
        # undecodable output: report it and fall back.
        print(e)
        return "0.0.0"
    version_parts = version.split("-")
    if len(version_parts) > 1 and version_parts[0].startswith("magic_pdf"):
        return version_parts[1]
    print(f"Invalid version tag {version}. Expected format is magic_pdf-<version>-released.")
    return "0.0.0"
requires = parse_requirements('requirements.txt')
setup(
name="magic_pdf", # 项目名
# version="0.1.3", # 版本号
version=get_version(), # 自动从tag中获取版本
packages=find_packages(), # 包含所有的包
install_requires=requires, # 项目依赖的第三方库
python_requires=">=3.9", # 项目依赖的 Python 版本
# entry_points={"console_scripts": ["my_command=my_project.main:run"]}, # 项目提供的可执行命令
include_package_data=True, # 是否包含非代码文件,如数据文件、配置文件等
zip_safe=False, # 是否使用 zip 文件格式打包,一般设为 False
)
if __name__ == '__main__':
setup(
name="magic_pdf", # 项目名
version=__version__, # 自动从tag中获取版本号
packages=find_packages(), # 包含所有的包
install_requires=parse_requirements('requirements.txt'), # 项目依赖的第三方库
python_requires=">=3.9", # 项目依赖的 Python 版本
# entry_points={"console_scripts": ["my_command=my_project.main:run"]}, # 项目提供的可执行命令
include_package_data=True, # 是否包含非代码文件,如数据文件、配置文件等
zip_safe=False, # 是否使用 zip 文件格式打包,一般设为 False
)

27
update_version.py Normal file
View File

@@ -0,0 +1,27 @@
import os
import subprocess
def get_version():
    """
    Derive the package version from the output of ``git describe --tags``.

    The output is split on ``-``; when the first part starts with
    ``magic_pdf`` the second part is returned as the version
    (expected tag format: ``magic_pdf-<version>-released``).

    Returns:
        The version string taken from the tag, or ``"0.0.0"`` when git cannot
        be run, exits with an error, or the tag has an unexpected format. The
        reason is printed in every fallback case.
    """
    command = ["git", "describe", "--tags"]
    try:
        version = subprocess.check_output(command).decode().strip()
    except (subprocess.SubprocessError, OSError, UnicodeDecodeError) as e:
        # git missing, non-zero exit (e.g. no tags / not a repository) or
        # undecodable output: report it and fall back.
        print(e)
        return "0.0.0"
    version_parts = version.split("-")
    if len(version_parts) > 1 and version_parts[0].startswith("magic_pdf"):
        return version_parts[1]
    print(f"Invalid version tag {version}. Expected format is magic_pdf-<version>-released.")
    return "0.0.0"
def write_version_to_commons(version):
    """
    Overwrite ``magic_pdf/libs/version.py`` with the given version.

    The target path is resolved relative to the directory containing this
    script, and the file is replaced by a single line
    ``__version__ = "<version>"``.

    Args:
        version: value interpolated into the ``__version__`` assignment.
    """
    commons_path = os.path.join(os.path.dirname(__file__), 'magic_pdf', 'libs', 'version.py')
    # Explicit encoding keeps the generated module independent of the locale.
    with open(commons_path, 'w', encoding='utf-8') as f:
        f.write(f'__version__ = "{version}"\n')
if __name__ == '__main__':
    # Resolve the version via get_version() and persist it to version.py.
    write_version_to_commons(get_version())