mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
Compare commits
19 Commits
magic_pdf-
...
magic_pdf-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8998380da5 | ||
|
|
35a700da94 | ||
|
|
38de8d59a8 | ||
|
|
df14c61f6f | ||
|
|
89d7964c74 | ||
|
|
5de013e6d5 | ||
|
|
6d79b1c7ce | ||
|
|
5f313bd0b4 | ||
|
|
3b7342b894 | ||
|
|
9dc5033cf7 | ||
|
|
389826c5fe | ||
|
|
c96aa88d13 | ||
|
|
738f9274a9 | ||
|
|
084dc22ab1 | ||
|
|
6c52856d2a | ||
|
|
c69f414b20 | ||
|
|
0306d66d25 | ||
|
|
35d39735da | ||
|
|
e57a9d87c7 |
10
.github/workflows/python-package.yml
vendored
10
.github/workflows/python-package.yml
vendored
@@ -119,8 +119,8 @@ jobs:
|
||||
files: './dist/*.whl'
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }}
|
||||
- name: Publish to PyPI
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
user: __token__
|
||||
password: ${{ secrets.PYPI_TOKEN }}
|
||||
|
||||
- name: Publish distribution to PyPI
|
||||
run: |
|
||||
pip install twine
|
||||
twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
|
||||
|
||||
@@ -31,7 +31,6 @@ from magic_pdf.libs.version import __version__
|
||||
|
||||
from magic_pdf.libs.MakeContentConfig import DropMode
|
||||
from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
|
||||
from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
|
||||
from magic_pdf.pipe.UNIPipe import UNIPipe
|
||||
from magic_pdf.pipe.OCRPipe import OCRPipe
|
||||
from magic_pdf.pipe.TXTPipe import TXTPipe
|
||||
@@ -101,18 +100,34 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
|
||||
# [pdf_file_name, pipe.pdf_mid_data['not_common_character_rate'], pipe.pdf_mid_data['not_printable_rate']])
|
||||
|
||||
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE)
|
||||
'''写markdown'''
|
||||
md_writer.write(
|
||||
content=md_content, path=f"{pdf_file_name}.md", mode=AbsReaderWriter.MODE_TXT
|
||||
)
|
||||
'''写middle_json'''
|
||||
md_writer.write(
|
||||
content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
|
||||
path=f"{pdf_file_name}.json",
|
||||
path=f"{pdf_file_name}_middle.json",
|
||||
mode=AbsReaderWriter.MODE_TXT,
|
||||
)
|
||||
|
||||
content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
|
||||
'''写model_json'''
|
||||
md_writer.write(
|
||||
str(content_list), f"{pdf_file_name}.txt", AbsReaderWriter.MODE_TXT
|
||||
content=json_parse.dumps(pipe.model_list, ensure_ascii=False, indent=4),
|
||||
path=f"{pdf_file_name}_model.json",
|
||||
mode=AbsReaderWriter.MODE_TXT,
|
||||
)
|
||||
'''写源pdf'''
|
||||
md_writer.write(
|
||||
content=pdf_bytes,
|
||||
path=f"{pdf_file_name}_origin.pdf",
|
||||
mode=AbsReaderWriter.MODE_BIN,
|
||||
)
|
||||
content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
|
||||
'''写content_list'''
|
||||
md_writer.write(
|
||||
content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
|
||||
path=f"{pdf_file_name}_content_list.json",
|
||||
mode=AbsReaderWriter.MODE_TXT
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -144,10 +144,17 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
|
||||
def merge_para_with_text(para_block):
|
||||
para_text = ''
|
||||
for line in para_block['lines']:
|
||||
line_text = ""
|
||||
line_lang = ""
|
||||
for span in line['spans']:
|
||||
span_type = span['type']
|
||||
if span_type == ContentType.Text:
|
||||
line_text += span['content'].strip()
|
||||
if line_text != "":
|
||||
line_lang = detect_lang(line_text)
|
||||
for span in line['spans']:
|
||||
span_type = span['type']
|
||||
content = ''
|
||||
language = ''
|
||||
if span_type == ContentType.Text:
|
||||
content = span['content']
|
||||
language = detect_lang(content)
|
||||
@@ -161,7 +168,7 @@ def merge_para_with_text(para_block):
|
||||
content = f"\n$$\n{span['content']}\n$$\n"
|
||||
|
||||
if content != '':
|
||||
if 'zh' in language:
|
||||
if 'zh' in line_lang: # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断
|
||||
para_text += content # 中文语境下,content间不需要空格分隔
|
||||
else:
|
||||
para_text += content + ' ' # 英文语境下 content间需要空格分隔
|
||||
|
||||
@@ -21,7 +21,7 @@ from magic_pdf.libs.commons import mymax, get_top_percent_list
|
||||
from magic_pdf.filter.pdf_meta_scan import scan_max_page, junk_limit_min
|
||||
|
||||
TEXT_LEN_THRESHOLD = 100
|
||||
AVG_TEXT_LEN_THRESHOLD = 200
|
||||
AVG_TEXT_LEN_THRESHOLD = 100
|
||||
TEXT_LEN_SAMPLE_RATIO = 0.1 # 抽取0.1的页面进行文字长度统计
|
||||
|
||||
|
||||
@@ -65,12 +65,14 @@ def merge_images(image_list, page_width, page_height, max_offset=5, max_gap=2):
|
||||
# 如果宽达标,检测是否能竖着拼
|
||||
if full_width:
|
||||
# 竖着拼需要满足两个前提,左右边界各偏移不能超过 max_offset,第一张图的下边界和第二张图的上边界偏移不能超过 max_gap
|
||||
close1 = (last_x0 - max_offset) <= x0 <= (last_x0 + max_offset) and (last_x1 - max_offset) <= x1 <= (last_x1 + max_offset) and (last_y1 - max_gap) <= y0 <= (last_y1 + max_gap)
|
||||
close1 = (last_x0 - max_offset) <= x0 <= (last_x0 + max_offset) and (last_x1 - max_offset) <= x1 <= (
|
||||
last_x1 + max_offset) and (last_y1 - max_gap) <= y0 <= (last_y1 + max_gap)
|
||||
|
||||
# 如果高达标,检测是否可以横着拼
|
||||
if full_height:
|
||||
# 横着拼需要满足两个前提,上下边界各偏移不能超过 max_offset,第一张图的右边界和第二张图的左边界偏移不能超过 max_gap
|
||||
close2 = (last_y0 - max_offset) <= y0 <= (last_y0 + max_offset) and (last_y1 - max_offset) <= y1 <= (last_y1 + max_offset) and (last_x1 - max_gap) <= x0 <= (last_x1 + max_gap)
|
||||
close2 = (last_y0 - max_offset) <= y0 <= (last_y0 + max_offset) and (last_y1 - max_offset) <= y1 <= (
|
||||
last_y1 + max_offset) and (last_x1 - max_gap) <= x0 <= (last_x1 + max_gap)
|
||||
|
||||
# Check if the image can be merged with the last image
|
||||
if (full_width and close1) or (full_height and close2):
|
||||
@@ -109,10 +111,9 @@ def classify_by_area(total_page: int, page_width, page_height, img_sz_list, text
|
||||
# 先对每个id出现的次数做个统计
|
||||
objid_cnt = Counter([objid for page_img_sz in img_sz_list for _, _, _, _, objid in page_img_sz])
|
||||
# 再去掉出现次数大于10的
|
||||
if total_page >= scan_max_page:# 新的meta_scan只扫描前 scan_max_page 页,页数大于 scan_max_page 当total_page为 scan_max_page
|
||||
if total_page >= scan_max_page: # 新的meta_scan只扫描前 scan_max_page 页,页数大于 scan_max_page 当total_page为 scan_max_page
|
||||
total_page = scan_max_page
|
||||
|
||||
|
||||
repeat_threshold = 2 # 把bad_image的阈值设为2
|
||||
# repeat_threshold = min(2, total_page) # 当total_page为1时,repeat_threshold为1,会产生误判导致所有img变成bad_img
|
||||
bad_image_objid = set([objid for objid, cnt in objid_cnt.items() if cnt >= repeat_threshold])
|
||||
@@ -129,26 +130,26 @@ def classify_by_area(total_page: int, page_width, page_height, img_sz_list, text
|
||||
# if len(fake_image_ids) > 0 and any([l > TEXT_LEN_THRESHOLD for l in text_len_at_bad_image_page_idx]): # 这些透明图片所在的页面上有文字大于阈值
|
||||
# return True
|
||||
|
||||
img_sz_list = [[img_sz for img_sz in page_img_sz if img_sz[-1] not in bad_image_objid] for page_img_sz in img_sz_list] # 过滤掉重复出现的图片
|
||||
|
||||
img_sz_list = [[img_sz for img_sz in page_img_sz if img_sz[-1] not in bad_image_objid] for page_img_sz in
|
||||
img_sz_list] # 过滤掉重复出现的图片
|
||||
|
||||
# 有的扫描版会把一页图片拆成很多张,需要先把图拼起来再计算
|
||||
img_sz_list = merge_images(img_sz_list, page_width, page_height)
|
||||
|
||||
# 计算每个页面上最大的图的面积,然后计算这个面积占页面面积的比例
|
||||
max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in img_sz_list]
|
||||
max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in
|
||||
img_sz_list]
|
||||
page_area = page_width * page_height
|
||||
max_image_area_per_page = [area / page_area for area in max_image_area_per_page]
|
||||
max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.5]
|
||||
|
||||
if len(max_image_area_per_page) >= 0.5 * total_page: # 阈值从0.8改到0.5,适配3页里面有两页和两页里面有一页的情况
|
||||
if len(max_image_area_per_page) >= 0.5 * total_page: # 阈值从0.8改到0.5,适配3页里面有两页和两页里面有一页的情况
|
||||
# 这里条件成立的前提是把反复出现的图片去掉了。这些图片是隐藏的透明图层,其特点是id都一样
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
|
||||
|
||||
def classify_by_text_len(text_len_list: list, total_page: int):
|
||||
"""
|
||||
随机抽取10%的页面,如果少于5个页面,那么就取全部页面。
|
||||
@@ -173,6 +174,7 @@ def classify_by_text_len(text_len_list: list, total_page: int):
|
||||
is_text_pdf = any([text_len > TEXT_LEN_THRESHOLD for text_len in text_len_lst])
|
||||
return is_text_pdf
|
||||
|
||||
|
||||
def classify_by_avg_words(text_len_list: list):
|
||||
"""
|
||||
补充规则,如果平均每页字数少于 AVG_TEXT_LEN_THRESHOLD,就不是文字pdf
|
||||
@@ -193,6 +195,7 @@ def classify_by_avg_words(text_len_list: list):
|
||||
|
||||
return is_text_pdf
|
||||
|
||||
|
||||
def classify_by_img_num(img_sz_list: list, img_num_list: list):
|
||||
"""
|
||||
补充规则,有一种扫描版本的PDF,每一页都会放所有的扫描页进去,在 metascan 时会被去重,
|
||||
@@ -208,11 +211,11 @@ def classify_by_img_num(img_sz_list: list, img_num_list: list):
|
||||
# img_sz_list中非空元素的个数小于1,前80%的元素都相等,且最大值大于等于junk_limit_min
|
||||
if count_img_sz_list_not_none <= 1 and len(set(top_eighty_percent)) == 1 and max(img_num_list) >= junk_limit_min:
|
||||
|
||||
#拿max和min的值,用来判断list内的值是否全都相等
|
||||
# min_imgs = min(img_num_list)
|
||||
# max_imgs = max(img_num_list)
|
||||
#
|
||||
# if count_img_sz_list_not_none == 0 and max_imgs == min_imgs and max_imgs >= junk_limit_min:
|
||||
#拿max和min的值,用来判断list内的值是否全都相等
|
||||
# min_imgs = min(img_num_list)
|
||||
# max_imgs = max(img_num_list)
|
||||
#
|
||||
# if count_img_sz_list_not_none == 0 and max_imgs == min_imgs and max_imgs >= junk_limit_min:
|
||||
return False # 如果满足这个条件,一定不是文字版pdf
|
||||
else:
|
||||
return True # 不满足这三个条件,可能是文字版pdf,通过其他规则判断
|
||||
@@ -244,6 +247,7 @@ def classify_by_text_layout(text_layout_per_page: list):
|
||||
else:
|
||||
return False # 文本布局未知,默认认为不是文字版pdf
|
||||
|
||||
|
||||
def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
|
||||
"""
|
||||
判断一页是否由细长条组成,有两个条件:
|
||||
@@ -258,6 +262,7 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
|
||||
Returns:
|
||||
bool: 如果满足条件的页面的比例小于0.5,返回True,否则返回False
|
||||
"""
|
||||
|
||||
def is_narrow_strip(img):
|
||||
x0, y0, x1, y1, _ = img
|
||||
width, height = x1 - x0, y1 - y0
|
||||
@@ -299,7 +304,8 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
|
||||
return narrow_strip_pages_ratio < 0.5
|
||||
|
||||
|
||||
def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list, text_layout_list: list):
|
||||
def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list,
|
||||
text_layout_list: list, invalid_chars: bool):
|
||||
"""
|
||||
这里的图片和页面长度单位是pts
|
||||
:param total_page:
|
||||
@@ -316,7 +322,8 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
|
||||
'by_avg_words': classify_by_avg_words(text_len_list),
|
||||
'by_img_num': classify_by_img_num(img_sz_list, img_num_list),
|
||||
'by_text_layout': classify_by_text_layout(text_layout_list),
|
||||
'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list)
|
||||
'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list),
|
||||
'by_invalid_chars': invalid_chars,
|
||||
}
|
||||
|
||||
if all(results.values()):
|
||||
@@ -324,7 +331,12 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
|
||||
elif not any(results.values()):
|
||||
return False, results
|
||||
else:
|
||||
logger.warning(f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']}, by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}, by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']}", file=sys.stderr) # 利用这种情况可以快速找出来哪些pdf比较特殊,针对性修正分类算法
|
||||
logger.warning(
|
||||
f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']},"
|
||||
f" by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']},"
|
||||
f" by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']},"
|
||||
f" by_invalid_chars: {results['by_invalid_chars']}",
|
||||
file=sys.stderr) # 利用这种情况可以快速找出来哪些pdf比较特殊,针对性修正分类算法
|
||||
return False, results
|
||||
|
||||
|
||||
|
||||
@@ -12,12 +12,13 @@ from collections import Counter
|
||||
|
||||
from magic_pdf.libs.drop_reason import DropReason
|
||||
from magic_pdf.libs.language import detect_lang
|
||||
from magic_pdf.libs.pdf_check import detect_invalid_chars
|
||||
|
||||
scan_max_page = 50
|
||||
junk_limit_min = 10
|
||||
|
||||
|
||||
def calculate_max_image_area_per_page(result:list, page_width_pts, page_height_pts):
|
||||
def calculate_max_image_area_per_page(result: list, page_width_pts, page_height_pts):
|
||||
max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in
|
||||
result]
|
||||
page_area = int(page_width_pts) * int(page_height_pts)
|
||||
@@ -25,14 +26,15 @@ def calculate_max_image_area_per_page(result:list, page_width_pts, page_height_p
|
||||
max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.6]
|
||||
return max_image_area_per_page
|
||||
|
||||
|
||||
def process_image(page, junk_img_bojids=[]):
|
||||
page_result = []# 存每个页面里的多张图四元组信息
|
||||
page_result = [] # 存每个页面里的多张图四元组信息
|
||||
items = page.get_images()
|
||||
dedup = set()
|
||||
for img in items:
|
||||
# 这里返回的是图片在page上的实际展示的大小。返回一个数组,每个元素第一部分是
|
||||
img_bojid = img[0]# 在pdf文件中是全局唯一的,如果这个图反复出现在pdf里那么就可能是垃圾信息,例如水印、页眉页脚等
|
||||
if img_bojid in junk_img_bojids:# 如果是垃圾图像,就跳过
|
||||
img_bojid = img[0] # 在pdf文件中是全局唯一的,如果这个图反复出现在pdf里那么就可能是垃圾信息,例如水印、页眉页脚等
|
||||
if img_bojid in junk_img_bojids: # 如果是垃圾图像,就跳过
|
||||
continue
|
||||
recs = page.get_image_rects(img, transform=True)
|
||||
if recs:
|
||||
@@ -47,6 +49,8 @@ def process_image(page, junk_img_bojids=[]):
|
||||
dedup.add((x0, y0, x1, y1, img_bojid))
|
||||
page_result.append([x0, y0, x1, y1, img_bojid])
|
||||
return page_result
|
||||
|
||||
|
||||
def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
|
||||
"""
|
||||
返回每个页面里的图片的四元组,每个页面多个图片。
|
||||
@@ -57,7 +61,7 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
|
||||
img_bojid_counter = Counter(img[0] for page in doc for img in page.get_images())
|
||||
# 找出出现次数超过 len(doc) 半数的 img_bojid
|
||||
|
||||
junk_limit = max(len(doc)*0.5, junk_limit_min)# 对一些页数比较少的进行豁免
|
||||
junk_limit = max(len(doc) * 0.5, junk_limit_min) # 对一些页数比较少的进行豁免
|
||||
|
||||
junk_img_bojids = [img_bojid for img_bojid, count in img_bojid_counter.items() if count >= junk_limit]
|
||||
|
||||
@@ -82,9 +86,10 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
|
||||
result.append(page_result)
|
||||
for item in result:
|
||||
if not any(item): # 如果任何一页没有图片,说明是个文字版,需要判断是否为特殊文字版
|
||||
if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:# 如果是特殊文字版,就把junklist置空并break
|
||||
if max(imgs_len_list) == min(imgs_len_list) and max(
|
||||
imgs_len_list) >= junk_limit_min: # 如果是特殊文字版,就把junklist置空并break
|
||||
junk_img_bojids = []
|
||||
else:# 不是特殊文字版,是个普通文字版,但是存在垃圾图片,不置空junklist
|
||||
else: # 不是特殊文字版,是个普通文字版,但是存在垃圾图片,不置空junklist
|
||||
pass
|
||||
break_loop = True
|
||||
break
|
||||
@@ -94,16 +99,16 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
|
||||
# 检查前80%的元素是否都相等
|
||||
if len(set(top_eighty_percent)) == 1 and max(imgs_len_list) >= junk_limit_min:
|
||||
|
||||
# # 如果前10页跑完都有图,根据每页图片数量是否相等判断是否需要清除junklist
|
||||
# if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:
|
||||
# # 如果前10页跑完都有图,根据每页图片数量是否相等判断是否需要清除junklist
|
||||
# if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:
|
||||
|
||||
#前10页都有图,且每页数量一致,需要检测图片大小占页面的比例判断是否需要清除junklist
|
||||
max_image_area_per_page = calculate_max_image_area_per_page(result, page_width_pts, page_height_pts)
|
||||
if len(max_image_area_per_page) < 0.8 * special_limit_pages: # 前10页不全是大图,说明可能是个文字版pdf,把垃圾图片list置空
|
||||
junk_img_bojids = []
|
||||
else:# 前10页都有图,而且80%都是大图,且每页图片数量一致并都很多,说明是扫描版1,不需要清空junklist
|
||||
else: # 前10页都有图,而且80%都是大图,且每页图片数量一致并都很多,说明是扫描版1,不需要清空junklist
|
||||
pass
|
||||
else:# 每页图片数量不一致,需要清掉junklist全量跑前50页图片
|
||||
else: # 每页图片数量不一致,需要清掉junklist全量跑前50页图片
|
||||
junk_img_bojids = []
|
||||
|
||||
#正式进入取前50页图片的信息流程
|
||||
@@ -136,7 +141,6 @@ def get_pdf_page_size_pts(doc: fitz.Document):
|
||||
median_width = page_width_list[len(page_width_list) // 2]
|
||||
median_height = page_height_list[len(page_height_list) // 2]
|
||||
|
||||
|
||||
return median_width, median_height
|
||||
|
||||
|
||||
@@ -156,6 +160,7 @@ def get_pdf_textlen_per_page(doc: fitz.Document):
|
||||
|
||||
return text_len_lst
|
||||
|
||||
|
||||
def get_pdf_text_layout_per_page(doc: fitz.Document):
|
||||
"""
|
||||
根据PDF文档的每一页文本布局,判断该页的文本布局是横向、纵向还是未知。
|
||||
@@ -233,11 +238,16 @@ def get_pdf_text_layout_per_page(doc: fitz.Document):
|
||||
# logger.info(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}")
|
||||
return text_layout_list
|
||||
|
||||
|
||||
'''定义一个自定义异常用来抛出单页svg太多的pdf'''
|
||||
|
||||
|
||||
class PageSvgsTooManyError(Exception):
    """Custom exception for a PDF in which a single page has too many SVGs."""

    def __init__(self, message="Page SVGs are too many"):
        # Hand the text to Exception first, then expose it as an attribute.
        super().__init__(message)
        self.message = message
|
||||
|
||||
|
||||
def get_svgs_per_page(doc: fitz.Document):
|
||||
svgs_len_list = []
|
||||
for page_id, page in enumerate(doc):
|
||||
@@ -251,6 +261,7 @@ def get_svgs_per_page(doc: fitz.Document):
|
||||
# logger.info(f"page_id: {page_id}, svgs_len: {len(svgs)}")
|
||||
return svgs_len_list
|
||||
|
||||
|
||||
def get_imgs_per_page(doc: fitz.Document):
|
||||
imgs_len_list = []
|
||||
for page_id, page in enumerate(doc):
|
||||
@@ -287,6 +298,13 @@ def get_language(doc: fitz.Document):
|
||||
return language
|
||||
|
||||
|
||||
def check_invalid_chars(pdf_bytes):
    """Garbled-text detection.

    Delegates to ``detect_invalid_chars`` and returns its result unchanged.
    """
    verdict = detect_invalid_chars(pdf_bytes)
    return verdict
|
||||
|
||||
|
||||
def pdf_meta_scan(pdf_bytes: bytes):
|
||||
"""
|
||||
:param s3_pdf_path:
|
||||
@@ -318,7 +336,8 @@ def pdf_meta_scan(pdf_bytes: bytes):
|
||||
# logger.info(f"text_layout_per_page: {text_layout_per_page}")
|
||||
text_language = get_language(doc)
|
||||
# logger.info(f"text_language: {text_language}")
|
||||
|
||||
invalid_chars = check_invalid_chars(pdf_bytes)
|
||||
# logger.info(f"invalid_chars: {invalid_chars}")
|
||||
|
||||
# 最后输出一条json
|
||||
res = {
|
||||
@@ -334,6 +353,7 @@ def pdf_meta_scan(pdf_bytes: bytes):
|
||||
# "svgs_per_page": svgs_per_page,
|
||||
"imgs_per_page": imgs_per_page, # 增加每页img数量list
|
||||
"junk_img_bojids": junk_img_bojids, # 增加垃圾图片的bojid list
|
||||
"invalid_chars": invalid_chars,
|
||||
"metadata": doc.metadata
|
||||
}
|
||||
# logger.info(json.dumps(res, ensure_ascii=False))
|
||||
@@ -365,4 +385,4 @@ if __name__ == '__main__':
|
||||
# file_content = read_file("D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师_大乘无量寿.pdf","")
|
||||
# doc = fitz.open("pdf", file_content)
|
||||
# text_layout_lst = get_pdf_text_layout_per_page(doc)
|
||||
# print(text_layout_lst)
|
||||
# print(text_layout_lst)
|
||||
|
||||
62
magic_pdf/libs/pdf_check.py
Normal file
62
magic_pdf/libs/pdf_check.py
Normal file
@@ -0,0 +1,62 @@
|
||||
from io import BytesIO
|
||||
import re
|
||||
import fitz
|
||||
import numpy as np
|
||||
from loguru import logger
|
||||
from pdfminer.high_level import extract_text
|
||||
|
||||
|
||||
def calculate_sample_count(total_page: int):
    """Return how many pages to sample from a document of ``total_page`` pages.

    The sample size is capped at 10; a shorter document is sampled in full.
    """
    sample_cap = 10
    return total_page if total_page < sample_cap else sample_cap
|
||||
|
||||
|
||||
def extract_pages(src_pdf_bytes: bytes):
    """Build a new PDF from randomly sampled pages of the source PDF.

    Args:
        src_pdf_bytes: Raw bytes of the source PDF.

    Returns:
        A new ``fitz.Document`` holding the sampled pages in random order.
        The number of pages is chosen by ``calculate_sample_count``. The
        document is empty when the source has no pages, and may be only
        partially filled if copying a page fails (the error is logged).
    """
    # Open the source as a context manager so the handle is released on every
    # return path; previously it was never closed.
    with fitz.open("pdf", src_pdf_bytes) as pdf_docs:
        total_page = len(pdf_docs)
        if total_page == 0:
            # A PDF without pages: hand back an empty document.
            logger.warning("PDF is empty, return empty document")
            return fitz.Document()
        select_page_cnt = calculate_sample_count(total_page)

        # Draw distinct page indices (sampling without replacement).
        page_num = np.random.choice(total_page, select_page_cnt, replace=False)
        sample_docs = fitz.Document()
        try:
            for index in page_num:
                # numpy integers are converted to plain int before the call.
                sample_docs.insert_pdf(pdf_docs, from_page=int(index), to_page=int(index))
        except Exception as e:
            # Best effort: log the failure and return the pages copied so far.
            logger.exception(e)
        return sample_docs
|
||||
|
||||
|
||||
def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
    """Check whether the text extracted from a PDF is free of garbled characters.

    Garbled text shows up in pdfminer output as ``(cid:N)`` placeholders. The
    share of such placeholders among all extracted characters (each
    placeholder counted as one character) is compared with a 5% threshold.

    Args:
        src_pdf_bytes: Raw bytes of the PDF to inspect.

    Returns:
        ``True`` for a normal document, ``False`` when more than 5% of the
        sampled text is garbled.
    """
    # pdfminer is slow, so only a random sample of about 10 pages is checked.
    sample_docs = extract_pages(src_pdf_bytes)
    try:
        if len(sample_docs) == 0:
            # extract_pages returns an empty document for a page-less PDF.
            # There is no text to inspect, so report "normal", the same
            # verdict the text_len == 0 branch below produces.
            # NOTE(review): PyMuPDF is expected to refuse serialising a
            # zero-page document, which made this path crash - confirm.
            return True
        sample_pdf_bytes = sample_docs.tobytes()
    finally:
        # The serialised bytes are all that is needed from here on.
        sample_docs.close()

    sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
    text = extract_text(sample_pdf_file_like_object)
    text = text.replace("\n", "")

    # Garbled text extracted by pdfminer has the form "(cid:xxx)".
    cid_pattern = re.compile(r'\(cid:\d+\)')
    matches = cid_pattern.findall(text)
    cid_count = len(matches)
    cid_len = sum(len(match) for match in matches)
    text_len = len(text)
    if text_len == 0:
        cid_chars_ratio = 0
    else:
        # Denominator: normal characters plus one unit per placeholder.
        cid_chars_ratio = cid_count / (cid_count + text_len - cid_len)
    logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_ratio}")

    # More than 5% garbled text marks the whole document as garbled (False).
    return cid_chars_ratio <= 0.05
|
||||
@@ -1 +1 @@
|
||||
__version__ = "0.5.1"
|
||||
__version__ = "0.5.6"
|
||||
|
||||
61
magic_pdf/model/doc_analyze_by_custom_model.py
Normal file
61
magic_pdf/model/doc_analyze_by_custom_model.py
Normal file
@@ -0,0 +1,61 @@
|
||||
import fitz
|
||||
import cv2
|
||||
from PIL import Image
|
||||
import numpy as np
|
||||
|
||||
from magic_pdf.model.model_list import MODEL
|
||||
from magic_pdf.model.pp_structure_v2 import CustomPaddleModel
|
||||
|
||||
|
||||
def dict_compare(d1, d2):
    """Return True when both dicts hold exactly the same key/value pairs."""
    left_items, right_items = d1.items(), d2.items()
    return left_items == right_items
|
||||
|
||||
|
||||
def remove_duplicates_dicts(lst):
    """Return the dicts of ``lst`` without repeats, keeping first occurrences.

    Two dicts count as duplicates when ``dict_compare`` says they are equal.
    The input list is not modified; order of first appearance is preserved.
    """
    kept = []
    for candidate in lst:
        already_kept = False
        for seen in kept:
            if dict_compare(candidate, seen):
                already_kept = True
                break
        if already_kept:
            continue
        kept.append(candidate)
    return kept
|
||||
|
||||
|
||||
def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
    """Render every page of a PDF to an image.

    Args:
        pdf_bytes: Raw bytes of the PDF.
        dpi: Rendering resolution; each page is scaled by ``dpi / 72``.

    Returns:
        One dict per page, in page order, with keys ``"img"`` (the rendered
        page converted from RGB to BGR as a numpy array), ``"width"`` and
        ``"height"`` (size of the rendered pixmap).
    """
    # The same scale is applied on both axes for every page.
    zoom = dpi / 72
    render_matrix = fitz.Matrix(zoom, zoom)

    rendered_pages = []
    with fitz.open("pdf", pdf_bytes) as doc:
        for page in doc:
            pixmap = page.get_pixmap(matrix=render_matrix, alpha=False)

            # NOTE: a disabled variant re-rendered pages at scale 1 when the
            # pixmap was wider or taller than 2000 pixels.

            rgb_image = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
            bgr_array = cv2.cvtColor(np.array(rgb_image), cv2.COLOR_RGB2BGR)
            rendered_pages.append(
                {"img": bgr_array, "width": pixmap.width, "height": pixmap.height}
            )
    return rendered_pages
|
||||
|
||||
|
||||
def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False, model=MODEL.Paddle):
    """Run the layout model on every page of a PDF.

    Args:
        pdf_bytes: Raw bytes of the PDF.
        ocr: Passed through to ``CustomPaddleModel``.
        show_log: Passed through to ``CustomPaddleModel``.
        model: Backend identifier; only ``MODEL.Paddle`` is supported.

    Returns:
        One dict per page, in page order, with keys ``"layout_dets"`` (the
        model output for that page) and ``"page_info"`` (``page_no`` as a
        zero-based index plus the ``height`` and ``width`` reported by
        ``load_images_from_pdf``).

    Raises:
        ValueError: If ``model`` is not a supported backend.
    """
    # Fail fast with a clear message. Previously an unsupported model left the
    # model object as None and the call crashed with "'NoneType' object is not
    # callable" only after every page had already been rendered.
    if model != MODEL.Paddle:
        raise ValueError(f"unsupported model: {model!r}")

    images = load_images_from_pdf(pdf_bytes)
    custom_model = CustomPaddleModel(ocr=ocr, show_log=show_log)

    model_json = []
    for index, img_dict in enumerate(images):
        result = custom_model(img_dict["img"])
        page_info = {
            "page_no": index,
            "height": img_dict["height"],
            "width": img_dict["width"],
        }
        model_json.append({"layout_dets": result, "page_info": page_info})

    return model_json
|
||||
@@ -1,125 +0,0 @@
|
||||
import random
|
||||
|
||||
import fitz
|
||||
import cv2
|
||||
from paddleocr import PPStructure
|
||||
from PIL import Image
|
||||
from loguru import logger
|
||||
import numpy as np
|
||||
|
||||
def region_to_bbox(region):
|
||||
x0 = region[0][0]
|
||||
y0 = region[0][1]
|
||||
x1 = region[2][0]
|
||||
y1 = region[2][1]
|
||||
return [x0, y0, x1, y1]
|
||||
|
||||
|
||||
def dict_compare(d1, d2):
|
||||
return d1.items() == d2.items()
|
||||
|
||||
|
||||
def remove_duplicates_dicts(lst):
|
||||
unique_dicts = []
|
||||
for dict_item in lst:
|
||||
if not any(dict_compare(dict_item, existing_dict) for existing_dict in unique_dicts):
|
||||
unique_dicts.append(dict_item)
|
||||
return unique_dicts
|
||||
def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
|
||||
ocr_engine = PPStructure(table=False, ocr=ocr, show_log=show_log)
|
||||
|
||||
imgs = []
|
||||
with fitz.open("pdf", pdf_bytes) as doc:
|
||||
for index in range(0, doc.page_count):
|
||||
page = doc[index]
|
||||
dpi = 200
|
||||
mat = fitz.Matrix(dpi / 72, dpi / 72)
|
||||
pm = page.get_pixmap(matrix=mat, alpha=False)
|
||||
|
||||
# if width or height > 2000 pixels, don't enlarge the image
|
||||
# if pm.width > 2000 or pm.height > 2000:
|
||||
# pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
|
||||
|
||||
img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
|
||||
img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
|
||||
img_dict = {
|
||||
"img": img,
|
||||
"width": pm.width,
|
||||
"height": pm.height
|
||||
}
|
||||
imgs.append(img_dict)
|
||||
|
||||
model_json = []
|
||||
for index, img_dict in enumerate(imgs):
|
||||
img = img_dict['img']
|
||||
page_width = img_dict['width']
|
||||
page_height = img_dict['height']
|
||||
result = ocr_engine(img)
|
||||
spans = []
|
||||
for line in result:
|
||||
line.pop('img')
|
||||
'''
|
||||
为paddle输出适配type no.
|
||||
title: 0 # 标题
|
||||
text: 1 # 文本
|
||||
header: 2 # abandon
|
||||
footer: 2 # abandon
|
||||
reference: 1 # 文本 or abandon
|
||||
equation: 8 # 行间公式 block
|
||||
equation: 14 # 行间公式 text
|
||||
figure: 3 # 图片
|
||||
figure_caption: 4 # 图片描述
|
||||
table: 5 # 表格
|
||||
table_caption: 6 # 表格描述
|
||||
'''
|
||||
if line['type'] == 'title':
|
||||
line['category_id'] = 0
|
||||
elif line['type'] in ['text', 'reference']:
|
||||
line['category_id'] = 1
|
||||
elif line['type'] == 'figure':
|
||||
line['category_id'] = 3
|
||||
elif line['type'] == 'figure_caption':
|
||||
line['category_id'] = 4
|
||||
elif line['type'] == 'table':
|
||||
line['category_id'] = 5
|
||||
elif line['type'] == 'table_caption':
|
||||
line['category_id'] = 6
|
||||
elif line['type'] == 'equation':
|
||||
line['category_id'] = 8
|
||||
elif line['type'] in ['header', 'footer']:
|
||||
line['category_id'] = 2
|
||||
else:
|
||||
logger.warning(f"unknown type: {line['type']}")
|
||||
|
||||
# 兼容不输出score的paddleocr版本
|
||||
if line.get("score") is None:
|
||||
line['score'] = 0.5 + random.random() * 0.5
|
||||
|
||||
res = line.pop('res', None)
|
||||
if res is not None and len(res) > 0:
|
||||
for span in res:
|
||||
new_span = {'category_id': 15,
|
||||
'bbox': region_to_bbox(span['text_region']),
|
||||
'score': span['confidence'],
|
||||
'text': span['text']
|
||||
}
|
||||
spans.append(new_span)
|
||||
|
||||
if len(spans) > 0:
|
||||
result.extend(spans)
|
||||
|
||||
result = remove_duplicates_dicts(result)
|
||||
|
||||
page_info = {
|
||||
"page_no": index,
|
||||
"height": page_height,
|
||||
"width": page_width
|
||||
}
|
||||
page_dict = {
|
||||
"layout_dets": result,
|
||||
"page_info": page_info
|
||||
}
|
||||
|
||||
model_json.append(page_dict)
|
||||
|
||||
return model_json
|
||||
2
magic_pdf/model/model_list.py
Normal file
2
magic_pdf/model/model_list.py
Normal file
@@ -0,0 +1,2 @@
|
||||
class MODEL:
    """Identifiers of the selectable layout-analysis backends."""

    # Selects the PP-Structure based backend.
    Paddle = "pp_structure_v2"
|
||||
75
magic_pdf/model/pp_structure_v2.py
Normal file
75
magic_pdf/model/pp_structure_v2.py
Normal file
@@ -0,0 +1,75 @@
|
||||
import random
|
||||
|
||||
from loguru import logger
|
||||
from paddleocr import PPStructure
|
||||
|
||||
|
||||
def region_to_bbox(region):
    """Convert a point-list region into an ``[x0, y0, x1, y1]`` box.

    The first point of ``region`` supplies ``x0, y0`` and the third point
    supplies ``x1, y1``; any other points are ignored.
    """
    first_point, third_point = region[0], region[2]
    return [first_point[0], first_point[1], third_point[0], third_point[1]]
|
||||
|
||||
|
||||
class CustomPaddleModel:
    """Callable wrapper around ``PPStructure`` that adapts its output.

    Calling an instance with an image runs the model and returns its result
    list with a ``category_id`` added per layout block, a ``score`` filled in
    where missing, and one extra entry (category 15) per recognised text span.
    """

    # Adapts paddle layout types to category ids:
    #   title -> 0, text / reference -> 1, header / footer -> 2 (abandon),
    #   figure -> 3, figure_caption -> 4, table -> 5, table_caption -> 6,
    #   equation -> 8 (display-formula block).
    # The original note also lists 14 for display-formula text; nothing here
    # assigns it.
    _CATEGORY_BY_TYPE = {
        "title": 0,
        "text": 1,
        "reference": 1,
        "figure": 3,
        "figure_caption": 4,
        "table": 5,
        "table_caption": 6,
        "equation": 8,
        "header": 2,
        "footer": 2,
    }

    def __init__(self, ocr: bool = False, show_log: bool = False):
        self.model = PPStructure(table=False, ocr=ocr, show_log=show_log)

    def __call__(self, img):
        detections = self.model(img)
        text_spans = []
        for block in detections:
            # Drop the image payload attached to each block.
            block.pop("img")

            block_type = block["type"]
            if block_type in self._CATEGORY_BY_TYPE:
                block["category_id"] = self._CATEGORY_BY_TYPE[block_type]
            else:
                # Unknown types are only reported; no category_id is set.
                logger.warning(f"unknown type: {block_type}")

            # Compatibility with paddleocr versions that do not output a
            # score: substitute a random value in [0.5, 1.0).
            if block.get("score") is None:
                block["score"] = 0.5 + random.random() * 0.5

            # Lift the per-block text results out into standalone span entries.
            ocr_items = block.pop("res", None)
            if ocr_items is not None and len(ocr_items) > 0:
                text_spans.extend(
                    {
                        "category_id": 15,
                        "bbox": region_to_bbox(item["text_region"]),
                        "score": item["confidence"],
                        "text": item["text"],
                    }
                    for item in ocr_items
                )

        if text_spans:
            detections.extend(text_spans)

        return detections
|
||||
@@ -83,6 +83,7 @@ class AbsPipe(ABC):
|
||||
pdf_meta["text_len_per_page"],
|
||||
pdf_meta["imgs_per_page"],
|
||||
pdf_meta["text_layout_per_page"],
|
||||
pdf_meta["invalid_chars"],
|
||||
)
|
||||
if is_text_pdf:
|
||||
return AbsPipe.PIP_TXT
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
from magic_pdf.libs.MakeContentConfig import DropMode
|
||||
from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
|
||||
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
||||
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
||||
from magic_pdf.pipe.AbsPipe import AbsPipe
|
||||
from magic_pdf.user_api import parse_ocr_pdf
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
from magic_pdf.libs.MakeContentConfig import DropMode
|
||||
from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
|
||||
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
||||
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
||||
from magic_pdf.libs.json_compressor import JsonCompressor
|
||||
from magic_pdf.pipe.AbsPipe import AbsPipe
|
||||
|
||||
@@ -3,7 +3,7 @@ import json
|
||||
from loguru import logger
|
||||
|
||||
from magic_pdf.libs.MakeContentConfig import DropMode
|
||||
from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
|
||||
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
||||
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
||||
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
||||
from magic_pdf.libs.commons import join_path
|
||||
|
||||
@@ -16,7 +16,7 @@ import re
|
||||
from loguru import logger
|
||||
|
||||
from magic_pdf.libs.version import __version__
|
||||
from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
|
||||
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
||||
from magic_pdf.rw import AbsReaderWriter
|
||||
from magic_pdf.pdf_parse_by_ocr_v2 import parse_pdf_by_ocr
|
||||
from magic_pdf.pdf_parse_by_txt_v2 import parse_pdf_by_txt
|
||||
@@ -86,41 +86,46 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
|
||||
return None
|
||||
|
||||
pdf_info_dict = parse_pdf(parse_pdf_by_txt)
|
||||
text_all = ""
|
||||
for page_dict in pdf_info_dict['pdf_info']:
|
||||
for para_block in page_dict['para_blocks']:
|
||||
if para_block['type'] in ['title', 'text']:
|
||||
for line in para_block['lines']:
|
||||
for span in line['spans']:
|
||||
text_all += span['content']
|
||||
# text_all = ""
|
||||
# for page_dict in pdf_info_dict['pdf_info']:
|
||||
# for para_block in page_dict['para_blocks']:
|
||||
# if para_block['type'] in ['title', 'text']:
|
||||
# for line in para_block['lines']:
|
||||
# for span in line['spans']:
|
||||
# text_all += span['content']
|
||||
|
||||
def calculate_not_common_character_rate(text):
|
||||
garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]')
|
||||
# 计算乱码字符的数量
|
||||
garbage_count = len(garbage_regex.findall(text))
|
||||
total = len(text)
|
||||
if total == 0:
|
||||
return 0 # 避免除以零的错误
|
||||
return garbage_count / total
|
||||
|
||||
def calculate_not_printable_rate(text):
|
||||
printable = sum(1 for c in text if c.isprintable())
|
||||
total = len(text)
|
||||
if total == 0:
|
||||
return 0 # 避免除以零的错误
|
||||
return (total - printable) / total
|
||||
|
||||
not_common_character_rate = calculate_not_common_character_rate(text_all)
|
||||
not_printable_rate = calculate_not_printable_rate(text_all)
|
||||
pdf_info_dict["_not_common_character_rate"] = not_common_character_rate
|
||||
pdf_info_dict["_not_printable_rate"] = not_printable_rate
|
||||
logger.info(f"not_common_character_rate: {not_common_character_rate}, not_printable_rate: {not_printable_rate}")
|
||||
# def calculate_not_common_character_rate(text):
|
||||
# garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]')
|
||||
# # 计算乱码字符的数量
|
||||
# garbage_count = len(garbage_regex.findall(text))
|
||||
# total = len(text)
|
||||
# if total == 0:
|
||||
# return 0 # 避免除以零的错误
|
||||
# return garbage_count / total
|
||||
#
|
||||
# def calculate_not_printable_rate(text):
|
||||
# printable_text = ""
|
||||
# for c in text:
|
||||
# if c.isprintable():
|
||||
# printable_text += c
|
||||
# printable_total = len(printable_text)
|
||||
# total = len(text)
|
||||
# if total == 0:
|
||||
# return 0 # 避免除以零的错误
|
||||
# return (total - printable_total) / total
|
||||
#
|
||||
# not_common_character_rate = calculate_not_common_character_rate(text_all)
|
||||
# not_printable_rate = calculate_not_printable_rate(text_all)
|
||||
# pdf_info_dict["_not_common_character_rate"] = not_common_character_rate
|
||||
# pdf_info_dict["_not_printable_rate"] = not_printable_rate
|
||||
# logger.info(f"not_common_character_rate: {not_common_character_rate}, not_printable_rate: {not_printable_rate}")
|
||||
'''新逻辑使用pdfminer识别乱码pdf,准确率高且不会误伤,已在解析流程之前进行处理'''
|
||||
# not_common_character_rate对小语种可能会有误伤,not_printable_rate对小语种较为友好
|
||||
if (pdf_info_dict is None
|
||||
or pdf_info_dict.get("_need_drop", False)
|
||||
or not_printable_rate > 0.02 # 参考一些正常的pdf,这个值没有超过0.01的,阈值设为0.02
|
||||
or pdf_info_dict.get("_need_drop", False)
|
||||
# or not_printable_rate > 0.02 # 参考一些正常的pdf,这个值没有超过0.01的,阈值设为0.02
|
||||
):
|
||||
logger.warning(f"parse_pdf_by_txt drop or error or garbled_rate too large, switch to parse_pdf_by_ocr")
|
||||
logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
|
||||
if input_model_is_empty:
|
||||
pdf_models = doc_analyze(pdf_bytes, ocr=True)
|
||||
pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
|
||||
|
||||
@@ -14,5 +14,5 @@ wordninja>=2.0.0
|
||||
scikit-learn>=1.0.2
|
||||
nltk==3.8.1
|
||||
s3pathlib>=2.1.1
|
||||
paddlepaddle
|
||||
paddleocr @ https://github.com/magicpdf/PaddleOCR/releases/download/paddleocr-2.8.2-released/paddleocr-2.8.2-py3-none-any.whl
|
||||
paddleocr @ https://github.com/magicpdf/PaddleOCR/releases/download/paddleocr-2.8.2-released/paddleocr-2.8.2-py3-none-any.whl
|
||||
pdfminer.six>=20231228
|
||||
4
setup.py
4
setup.py
@@ -24,6 +24,10 @@ if __name__ == '__main__':
|
||||
version=__version__, # 自动从tag中获取版本号
|
||||
packages=find_packages(), # 包含所有的包
|
||||
install_requires=parse_requirements('requirements.txt'), # 项目依赖的第三方库
|
||||
extras_require={
|
||||
"gpu": ["paddlepaddle-gpu"],
|
||||
"cpu": ["paddlepaddle"],
|
||||
},
|
||||
python_requires=">=3.9", # 项目依赖的 Python 版本
|
||||
# entry_points={"console_scripts": ["my_command=my_project.main:run"]}, # 项目提供的可执行命令
|
||||
include_package_data=True, # 是否包含非代码文件,如数据文件、配置文件等
|
||||
|
||||
Reference in New Issue
Block a user