Compare commits

...

19 Commits

Author SHA1 Message Date
赵小蒙
8998380da5 update check invalid_chars algorithm to improve accuracy 2024-06-20 11:18:15 +08:00
赵小蒙
35a700da94 update annotate 2024-06-19 16:53:51 +08:00
myhloli
38de8d59a8 Update version.py with new version 2024-06-19 08:06:42 +00:00
赵小蒙
df14c61f6f update: Enhance the capability to detect garbled document issues 2024-06-19 16:03:21 +08:00
赵小蒙
89d7964c74 Merge remote-tracking branch 'origin/master' 2024-06-19 12:55:06 +08:00
赵小蒙
5de013e6d5 fix:use line_lang instead of content_lang to concatenate para 2024-06-19 12:54:54 +08:00
myhloli
6d79b1c7ce Update version.py with new version 2024-06-18 10:43:48 +00:00
赵小蒙
5f313bd0b4 fix local write pdf file name bug 2024-06-18 15:44:45 +08:00
赵小蒙
3b7342b894 update cli output files 2024-06-18 15:39:27 +08:00
赵小蒙
9dc5033cf7 update requirements 2024-06-18 14:51:06 +08:00
赵小蒙
389826c5fe update custom model framework 2024-06-18 14:45:23 +08:00
myhloli
c96aa88d13 Merge pull request #119 from icecraft/feat/parallel_paddle
feat: parallelize paddle
2024-06-18 14:15:06 +08:00
blue
738f9274a9 feat: parallelize paddle 2024-06-18 13:57:40 +08:00
赵小蒙
084dc22ab1 update AVG_TEXT_LEN_THRESHOLD 200->100 2024-06-18 10:46:54 +08:00
赵小蒙
6c52856d2a remove useless import 2024-06-17 19:16:19 +08:00
赵小蒙
c69f414b20 update pypi upload logic 2024-06-17 12:38:40 +08:00
赵小蒙
0306d66d25 update pypi upload logic 2024-06-17 12:32:47 +08:00
赵小蒙
35d39735da update pypi upload logic 2024-06-17 12:21:03 +08:00
myhloli
e57a9d87c7 Update version.py with new version 2024-06-17 04:11:27 +00:00
19 changed files with 346 additions and 207 deletions

View File

@@ -119,8 +119,8 @@ jobs:
files: './dist/*.whl'
env:
GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }}
- name: Publish to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
with:
user: __token__
password: ${{ secrets.PYPI_TOKEN }}
- name: Publish distribution to PyPI
run: |
pip install twine
twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}

View File

@@ -31,7 +31,6 @@ from magic_pdf.libs.version import __version__
from magic_pdf.libs.MakeContentConfig import DropMode
from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
@@ -101,18 +100,34 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
# [pdf_file_name, pipe.pdf_mid_data['not_common_character_rate'], pipe.pdf_mid_data['not_printable_rate']])
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE)
'''write markdown'''
md_writer.write(
content=md_content, path=f"{pdf_file_name}.md", mode=AbsReaderWriter.MODE_TXT
)
'''write middle_json'''
md_writer.write(
content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
path=f"{pdf_file_name}.json",
path=f"{pdf_file_name}_middle.json",
mode=AbsReaderWriter.MODE_TXT,
)
content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
'''write model_json'''
md_writer.write(
str(content_list), f"{pdf_file_name}.txt", AbsReaderWriter.MODE_TXT
content=json_parse.dumps(pipe.model_list, ensure_ascii=False, indent=4),
path=f"{pdf_file_name}_model.json",
mode=AbsReaderWriter.MODE_TXT,
)
'''write the source pdf'''
md_writer.write(
content=pdf_bytes,
path=f"{pdf_file_name}_origin.pdf",
mode=AbsReaderWriter.MODE_BIN,
)
content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
'''write content_list'''
md_writer.write(
content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
path=f"{pdf_file_name}_content_list.json",
mode=AbsReaderWriter.MODE_TXT
)
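With this change a single CLI run writes out the full set of artifacts instead of only the markdown and one JSON. For a hypothetical input named demo.pdf, the writer calls above would produce:

demo.md                  # markdown rendering
demo_middle.json         # intermediate pipeline state (pipe.pdf_mid_data)
demo_model.json          # raw model output (pipe.model_list)
demo_origin.pdf          # byte-for-byte copy of the source pdf
demo_content_list.json   # unified content list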

View File

@@ -144,10 +144,17 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
def merge_para_with_text(para_block):
para_text = ''
for line in para_block['lines']:
line_text = ""
line_lang = ""
for span in line['spans']:
span_type = span['type']
if span_type == ContentType.Text:
line_text += span['content'].strip()
if line_text != "":
line_lang = detect_lang(line_text)
for span in line['spans']:
span_type = span['type']
content = ''
language = ''
if span_type == ContentType.Text:
content = span['content']
language = detect_lang(content)
@@ -161,7 +168,7 @@ def merge_para_with_text(para_block):
content = f"\n$$\n{span['content']}\n$$\n"
if content != '':
if 'zh' in language:
if 'zh' in line_lang: # some documents put one character per span; per-span language detection is unreliable there, so judge by the whole line's text
para_text += content # in a Chinese context, no space is needed between content segments
else:
para_text += content + ' ' # in an English context, a space is needed between content segments
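The fix is easy to isolate: detect the language once per line from the concatenated line text, then use that line-level verdict when deciding whether spans are joined with spaces. A minimal sketch of the idea, with detect_lang passed in as a stand-in for magic_pdf.libs.language.detect_lang:

def join_spans(spans, detect_lang):
    # concatenate the text spans first: in documents that put one character
    # per span, a per-span language guess is unreliable, but the whole line is stable
    line_text = "".join(s["content"].strip() for s in spans if s["type"] == "text")
    line_lang = detect_lang(line_text) if line_text else ""
    joined = ""
    for s in spans:
        if s["type"] != "text":
            continue
        if "zh" in line_lang:
            joined += s["content"]        # Chinese: no separator between spans
        else:
            joined += s["content"] + " "  # Latin scripts: space-separate spans
    return joined.strip()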

View File

@@ -21,7 +21,7 @@ from magic_pdf.libs.commons import mymax, get_top_percent_list
from magic_pdf.filter.pdf_meta_scan import scan_max_page, junk_limit_min
TEXT_LEN_THRESHOLD = 100
AVG_TEXT_LEN_THRESHOLD = 200
AVG_TEXT_LEN_THRESHOLD = 100
TEXT_LEN_SAMPLE_RATIO = 0.1 # sample 10% of the pages for text-length statistics
@@ -65,12 +65,14 @@ def merge_images(image_list, page_width, page_height, max_offset=5, max_gap=2):
# if the width qualifies, check whether the images can be stitched vertically
if full_width:
# vertical stitching has two preconditions: the left/right edges may each shift by at most max_offset, and the gap between the first image's bottom edge and the second image's top edge may not exceed max_gap
close1 = (last_x0 - max_offset) <= x0 <= (last_x0 + max_offset) and (last_x1 - max_offset) <= x1 <= (last_x1 + max_offset) and (last_y1 - max_gap) <= y0 <= (last_y1 + max_gap)
close1 = (last_x0 - max_offset) <= x0 <= (last_x0 + max_offset) and (last_x1 - max_offset) <= x1 <= (
last_x1 + max_offset) and (last_y1 - max_gap) <= y0 <= (last_y1 + max_gap)
# if the height qualifies, check whether the images can be stitched horizontally
if full_height:
# horizontal stitching has two preconditions: the top/bottom edges may each shift by at most max_offset, and the gap between the first image's right edge and the second image's left edge may not exceed max_gap
close2 = (last_y0 - max_offset) <= y0 <= (last_y0 + max_offset) and (last_y1 - max_offset) <= y1 <= (last_y1 + max_offset) and (last_x1 - max_gap) <= x0 <= (last_x1 + max_gap)
close2 = (last_y0 - max_offset) <= y0 <= (last_y0 + max_offset) and (last_y1 - max_offset) <= y1 <= (
last_y1 + max_offset) and (last_x1 - max_gap) <= x0 <= (last_x1 + max_gap)
# Check if the image can be merged with the last image
if (full_width and close1) or (full_height and close2):
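A quick numeric illustration of the vertical test, with hypothetical boxes and the defaults max_offset=5, max_gap=2 from the signature above:

# (x0, y0, x1, y1) in pts; both fragments span roughly the full page width
last = (50, 100, 550, 400)   # first fragment
cur = (52, 401, 548, 700)    # second fragment, directly below
# left edges (50 vs 52) and right edges (550 vs 548) are within max_offset=5,
# and cur's top (401) is within max_gap=2 of last's bottom (400),
# so close1 holds and the two fragments merge into one tall image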
@@ -109,10 +111,9 @@ def classify_by_area(total_page: int, page_width, page_height, img_sz_list, text
# first count how many times each object id appears
objid_cnt = Counter([objid for page_img_sz in img_sz_list for _, _, _, _, objid in page_img_sz])
# then drop the ids that appear more than 10 times
if total_page >= scan_max_page:# the new meta_scan only scans the first scan_max_page pages; when the page count exceeds scan_max_page, total_page is capped at scan_max_page
if total_page >= scan_max_page:  # the new meta_scan only scans the first scan_max_page pages; when the page count exceeds scan_max_page, total_page is capped at scan_max_page
total_page = scan_max_page
repeat_threshold = 2  # set the bad_image threshold to 2
# repeat_threshold = min(2, total_page)  # when total_page is 1, repeat_threshold becomes 1 and misclassifies every img as bad_img
bad_image_objid = set([objid for objid, cnt in objid_cnt.items() if cnt >= repeat_threshold])
@@ -129,26 +130,26 @@ def classify_by_area(total_page: int, page_width, page_height, img_sz_list, text
# if len(fake_image_ids) > 0 and any([l > TEXT_LEN_THRESHOLD for l in text_len_at_bad_image_page_idx]): # the pages hosting these transparent images have text above the threshold
# return True
img_sz_list = [[img_sz for img_sz in page_img_sz if img_sz[-1] not in bad_image_objid] for page_img_sz in img_sz_list] # filter out the repeatedly occurring images
img_sz_list = [[img_sz for img_sz in page_img_sz if img_sz[-1] not in bad_image_objid] for page_img_sz in
img_sz_list]  # filter out the repeatedly occurring images
# some scanned PDFs split one page image into many fragments; stitch them back together before computing
img_sz_list = merge_images(img_sz_list, page_width, page_height)
# compute the largest image area on each page, then its ratio to the page area
max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in img_sz_list]
max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in
img_sz_list]
page_area = page_width * page_height
max_image_area_per_page = [area / page_area for area in max_image_area_per_page]
max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.5]
if len(max_image_area_per_page) >= 0.5 * total_page: # threshold lowered from 0.8 to 0.5 to cover the 2-of-3-page and 1-of-2-page cases
if len(max_image_area_per_page) >= 0.5 * total_page:  # threshold lowered from 0.8 to 0.5 to cover the 2-of-3-page and 1-of-2-page cases
# this condition only holds once the repeatedly occurring images have been removed; those are hidden transparent layers whose ids are all identical
return False
else:
return True
def classify_by_text_len(text_len_list: list, total_page: int):
"""
Randomly sample 10% of the pages; if there are fewer than 5 pages, take all of them.
@@ -173,6 +174,7 @@ def classify_by_text_len(text_len_list: list, total_page: int):
is_text_pdf = any([text_len > TEXT_LEN_THRESHOLD for text_len in text_len_lst])
return is_text_pdf
def classify_by_avg_words(text_len_list: list):
"""
Supplementary rule: if the average text length per page is below AVG_TEXT_LEN_THRESHOLD, it is not a text PDF.
@@ -193,6 +195,7 @@ def classify_by_avg_words(text_len_list: list):
return is_text_pdf
def classify_by_img_num(img_sz_list: list, img_num_list: list):
"""
Supplementary rule: some scanned PDFs embed all scanned pages into every page; meta_scan deduplicates these,
@@ -208,11 +211,11 @@ def classify_by_img_num(img_sz_list: list, img_num_list: list):
# at most 1 non-empty element in img_sz_list, the first 80% of elements are all equal, and the maximum is at least junk_limit_min
if count_img_sz_list_not_none <= 1 and len(set(top_eighty_percent)) == 1 and max(img_num_list) >= junk_limit_min:
#take the max and min values to check whether all values in the list are equal
# min_imgs = min(img_num_list)
# max_imgs = max(img_num_list)
#
# if count_img_sz_list_not_none == 0 and max_imgs == min_imgs and max_imgs >= junk_limit_min:
#take the max and min values to check whether all values in the list are equal
# min_imgs = min(img_num_list)
# max_imgs = max(img_num_list)
#
# if count_img_sz_list_not_none == 0 and max_imgs == min_imgs and max_imgs >= junk_limit_min:
return False # if this condition holds, it is definitely not a text PDF
else:
return True # if these three conditions are not met, it may be a text PDF; decide via the other rules
@@ -244,6 +247,7 @@ def classify_by_text_layout(text_layout_per_page: list):
else:
return False # text layout unknown; assume it is not a text PDF by default
def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
"""
Determine whether a page is composed of narrow strips; there are two conditions:
@@ -258,6 +262,7 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
Returns:
bool: True if the proportion of qualifying pages is below 0.5, otherwise False
"""
def is_narrow_strip(img):
x0, y0, x1, y1, _ = img
width, height = x1 - x0, y1 - y0
@@ -299,7 +304,8 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
return narrow_strip_pages_ratio < 0.5
def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list, text_layout_list: list):
def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list,
text_layout_list: list, invalid_chars: bool):
"""
Image and page dimensions here are in pts
:param total_page:
@@ -316,7 +322,8 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
'by_avg_words': classify_by_avg_words(text_len_list),
'by_img_num': classify_by_img_num(img_sz_list, img_num_list),
'by_text_layout': classify_by_text_layout(text_layout_list),
'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list)
'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list),
'by_invalid_chars': invalid_chars,
}
if all(results.values()):
@@ -324,7 +331,12 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
elif not any(results.values()):
return False, results
else:
logger.warning(f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']}, by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}, by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']}", file=sys.stderr) # 利用这种情况可以快速找出来哪些pdf比较特殊针对性修正分类算法
logger.warning(
f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']},"
f" by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']},"
f" by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']},"
f" by_invalid_chars: {results['by_invalid_chars']}",
file=sys.stderr) # this situation helps quickly surface the unusual PDFs so the classification algorithm can be tuned for them
return False, results
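Note how the new by_invalid_chars vote is threaded through: pdf_meta_scan (next file) computes it once per document and stores it in the meta dict, and AbsPipe (further down) forwards it into classify alongside the per-page statistics. A minimal sketch of the call chain; the first four key names are hypothetical since they do not appear in this diff, while the last four are taken from the AbsPipe and pdf_meta_scan hunks:

meta = pdf_meta_scan(pdf_bytes)
is_text_pdf, results = classify(
    meta["total_page"],           # hypothetical key name
    meta["page_width_pts"],       # hypothetical key name
    meta["page_height_pts"],      # hypothetical key name
    meta["image_info_per_page"],  # hypothetical key name
    meta["text_len_per_page"],
    meta["imgs_per_page"],
    meta["text_layout_per_page"],
    meta["invalid_chars"],        # the new boolean voter
)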

View File

@@ -12,12 +12,13 @@ from collections import Counter
from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.pdf_check import detect_invalid_chars
scan_max_page = 50
junk_limit_min = 10
def calculate_max_image_area_per_page(result:list, page_width_pts, page_height_pts):
def calculate_max_image_area_per_page(result: list, page_width_pts, page_height_pts):
max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in
result]
page_area = int(page_width_pts) * int(page_height_pts)
@@ -25,14 +26,15 @@ def calculate_max_image_area_per_page(result:list, page_width_pts, page_height_p
max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.6]
return max_image_area_per_page
def process_image(page, junk_img_bojids=[]):
page_result = []# stores the bbox quadruples of the multiple images on each page
page_result = []  # stores the bbox quadruples of the multiple images on each page
items = page.get_images()
dedup = set()
for img in items:
# this is the image's actual displayed size on the page; returns an array where the first part of each element is
img_bojid = img[0]# globally unique within the pdf file; if this image recurs throughout the pdf it is likely junk, e.g. a watermark or header/footer
if img_bojid in junk_img_bojids:# skip junk images
img_bojid = img[0]  # globally unique within the pdf file; if this image recurs throughout the pdf it is likely junk, e.g. a watermark or header/footer
if img_bojid in junk_img_bojids:  # skip junk images
continue
recs = page.get_image_rects(img, transform=True)
if recs:
@@ -47,6 +49,8 @@ def process_image(page, junk_img_bojids=[]):
dedup.add((x0, y0, x1, y1, img_bojid))
page_result.append([x0, y0, x1, y1, img_bojid])
return page_result
def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
"""
Return the quadruples of the images on each page; a page may contain multiple images.
@@ -57,7 +61,7 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
img_bojid_counter = Counter(img[0] for page in doc for img in page.get_images())
# find the img_bojids that appear on more than half of len(doc) pages
junk_limit = max(len(doc)*0.5, junk_limit_min)# exempt documents with relatively few pages
junk_limit = max(len(doc) * 0.5, junk_limit_min)  # exempt documents with relatively few pages
junk_img_bojids = [img_bojid for img_bojid, count in img_bojid_counter.items() if count >= junk_limit]
@@ -82,9 +86,10 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
result.append(page_result)
for item in result:
if not any(item): # if some page has no images, this is a text PDF; check whether it is a special text PDF
if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:# if it is a special text PDF, clear the junk list and break
if max(imgs_len_list) == min(imgs_len_list) and max(
imgs_len_list) >= junk_limit_min:  # if it is a special text PDF, clear the junk list and break
junk_img_bojids = []
else:# not a special text PDF, just an ordinary one with junk images present; keep the junk list
else:  # not a special text PDF, just an ordinary one with junk images present; keep the junk list
pass
break_loop = True
break
@@ -94,16 +99,16 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
# check whether the first 80% of the elements are all equal
if len(set(top_eighty_percent)) == 1 and max(imgs_len_list) >= junk_limit_min:
# # if the first 10 pages all have images, decide from whether the per-page image counts are equal whether the junk list needs clearing
# if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:
# # if the first 10 pages all have images, decide from whether the per-page image counts are equal whether the junk list needs clearing
# if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:
#the first 10 pages all have images with equal counts; check the ratio of image size to page size to decide whether the junk list needs clearing
max_image_area_per_page = calculate_max_image_area_per_page(result, page_width_pts, page_height_pts)
if len(max_image_area_per_page) < 0.8 * special_limit_pages: # not all of the first 10 pages are large images, so this may be a text PDF; clear the junk image list
junk_img_bojids = []
else:# the first 10 pages all have images, 80% of them large, with equal and high per-page counts: this is a scanned PDF, so do not clear the junk list
else:  # the first 10 pages all have images, 80% of them large, with equal and high per-page counts: this is a scanned PDF, so do not clear the junk list
pass
else:# per-page image counts differ; clear the junk list and process all images of the first 50 pages
else:  # per-page image counts differ; clear the junk list and process all images of the first 50 pages
junk_img_bojids = []
#formally enter the flow of collecting image info for the first 50 pages
@@ -136,7 +141,6 @@ def get_pdf_page_size_pts(doc: fitz.Document):
median_width = page_width_list[len(page_width_list) // 2]
median_height = page_height_list[len(page_height_list) // 2]
return median_width, median_height
@@ -156,6 +160,7 @@ def get_pdf_textlen_per_page(doc: fitz.Document):
return text_len_lst
def get_pdf_text_layout_per_page(doc: fitz.Document):
"""
Determine, for each page of the PDF document, whether its text layout is horizontal, vertical, or unknown.
@@ -233,11 +238,16 @@ def get_pdf_text_layout_per_page(doc: fitz.Document):
# logger.info(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}")
return text_layout_list
'''a custom exception used to flag PDFs with too many SVGs on a single page'''
class PageSvgsTooManyError(Exception):
def __init__(self, message="Page SVGs are too many"):
self.message = message
super().__init__(self.message)
def get_svgs_per_page(doc: fitz.Document):
svgs_len_list = []
for page_id, page in enumerate(doc):
@@ -251,6 +261,7 @@ def get_svgs_per_page(doc: fitz.Document):
# logger.info(f"page_id: {page_id}, svgs_len: {len(svgs)}")
return svgs_len_list
def get_imgs_per_page(doc: fitz.Document):
imgs_len_list = []
for page_id, page in enumerate(doc):
@@ -287,6 +298,13 @@ def get_language(doc: fitz.Document):
return language
def check_invalid_chars(pdf_bytes):
"""
Garbled-character detection
"""
return detect_invalid_chars(pdf_bytes)
def pdf_meta_scan(pdf_bytes: bytes):
"""
:param s3_pdf_path:
@@ -318,7 +336,8 @@ def pdf_meta_scan(pdf_bytes: bytes):
# logger.info(f"text_layout_per_page: {text_layout_per_page}")
text_language = get_language(doc)
# logger.info(f"text_language: {text_language}")
invalid_chars = check_invalid_chars(pdf_bytes)
# logger.info(f"invalid_chars: {invalid_chars}")
# finally emit one json record
res = {
@@ -334,6 +353,7 @@ def pdf_meta_scan(pdf_bytes: bytes):
# "svgs_per_page": svgs_per_page,
"imgs_per_page": imgs_per_page, # 增加每页img数量list
"junk_img_bojids": junk_img_bojids, # 增加垃圾图片的bojid list
"invalid_chars": invalid_chars,
"metadata": doc.metadata
}
# logger.info(json.dumps(res, ensure_ascii=False))
@@ -365,4 +385,4 @@ if __name__ == '__main__':
# file_content = read_file("D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师_大乘无量寿.pdf","")
# doc = fitz.open("pdf", file_content)
# text_layout_lst = get_pdf_text_layout_per_page(doc)
# print(text_layout_lst)
# print(text_layout_lst)

View File

@@ -0,0 +1,62 @@
from io import BytesIO
import re
import fitz
import numpy as np
from loguru import logger
from pdfminer.high_level import extract_text
def calculate_sample_count(total_page: int):
"""
Compute the number of pages to sample from the total page count and the sampling rate.
"""
select_page_cnt = min(10, total_page)
return select_page_cnt
def extract_pages(src_pdf_bytes: bytes):
pdf_docs = fitz.open("pdf", src_pdf_bytes)
total_page = len(pdf_docs)
if total_page == 0:
# if the PDF has no pages, return an empty document directly
logger.warning("PDF is empty, return empty document")
return fitz.Document()
select_page_cnt = calculate_sample_count(total_page)
page_num = np.random.choice(total_page, select_page_cnt, replace=False)
sample_docs = fitz.Document()
try:
for index in page_num:
sample_docs.insert_pdf(pdf_docs, from_page=int(index), to_page=int(index))
except Exception as e:
logger.exception(e)
return sample_docs
def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
""""
检测PDF中是否包含非法字符
"""
'''pdfminer is relatively slow, so first randomly sample about 10 pages'''
sample_docs = extract_pages(src_pdf_bytes)
sample_pdf_bytes = sample_docs.tobytes()
sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
text = extract_text(sample_pdf_file_like_object)
text = text.replace("\n", "")
# logger.info(text)
'''garbled text extracted by pdfminer carries the signature (cid:xxx)'''
cid_pattern = re.compile(r'\(cid:\d+\)')
matches = cid_pattern.findall(text)
cid_count = len(matches)
cid_len = sum(len(match) for match in matches)
text_len = len(text)
if text_len == 0:
cid_chars_radio = 0
else:
cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
'''if more than 5% of a document's text is garbled, treat the whole document as garbled'''
if cid_chars_radio > 0.05:
return False # garbled document
else:
return True # normal document
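A quick way to exercise the new check end to end; the input path is hypothetical, and note the inverted return convention (True means the text layer is clean):

from magic_pdf.libs.pdf_check import detect_invalid_chars

with open("sample.pdf", "rb") as f:   # hypothetical input file
    pdf_bytes = f.read()
if detect_invalid_chars(pdf_bytes):
    print("text layer looks clean")
else:
    print("over 5% (cid:N) tokens: treat as garbled, route to OCR")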

View File

@@ -1 +1 @@
__version__ = "0.5.1"
__version__ = "0.5.6"

View File

@@ -0,0 +1,61 @@
import fitz
import cv2
from PIL import Image
import numpy as np
from magic_pdf.model.model_list import MODEL
from magic_pdf.model.pp_structure_v2 import CustomPaddleModel
def dict_compare(d1, d2):
return d1.items() == d2.items()
def remove_duplicates_dicts(lst):
unique_dicts = []
for dict_item in lst:
if not any(
dict_compare(dict_item, existing_dict) for existing_dict in unique_dicts
):
unique_dicts.append(dict_item)
return unique_dicts
def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
images = []
with fitz.open("pdf", pdf_bytes) as doc:
for index in range(0, doc.page_count):
page = doc[index]
mat = fitz.Matrix(dpi / 72, dpi / 72)
pm = page.get_pixmap(matrix=mat, alpha=False)
# if width or height > 2000 pixels, don't enlarge the image
# if pm.width > 2000 or pm.height > 2000:
# pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
img_dict = {"img": img, "width": pm.width, "height": pm.height}
images.append(img_dict)
return images
def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False, model=MODEL.Paddle):
images = load_images_from_pdf(pdf_bytes)
custom_model = None
if model == MODEL.Paddle:
custom_model = CustomPaddleModel(ocr=ocr, show_log=show_log)
else:
pass
model_json = []
for index, img_dict in enumerate(images):
img = img_dict["img"]
page_width = img_dict["width"]
page_height = img_dict["height"]
result = custom_model(img)
page_info = {"page_no": index, "height": page_height, "width": page_width}
page_dict = {"layout_dets": result, "page_info": page_info}
model_json.append(page_dict)
return model_json
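The refactor separates page rasterization (load_images_from_pdf) from inference and selects the backend through the MODEL enum. A hedged usage sketch; the input path is hypothetical:

from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.model.model_list import MODEL

with open("sample.pdf", "rb") as f:   # hypothetical input file
    pdf_bytes = f.read()
# ocr=True enables OCR inside PPStructure; model currently defaults to MODEL.Paddle
model_json = doc_analyze(pdf_bytes, ocr=True, show_log=False, model=MODEL.Paddle)
print(model_json[0]["page_info"])     # {'page_no': 0, 'height': ..., 'width': ...}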

View File

@@ -1,125 +0,0 @@
import random
import fitz
import cv2
from paddleocr import PPStructure
from PIL import Image
from loguru import logger
import numpy as np
def region_to_bbox(region):
x0 = region[0][0]
y0 = region[0][1]
x1 = region[2][0]
y1 = region[2][1]
return [x0, y0, x1, y1]
def dict_compare(d1, d2):
return d1.items() == d2.items()
def remove_duplicates_dicts(lst):
unique_dicts = []
for dict_item in lst:
if not any(dict_compare(dict_item, existing_dict) for existing_dict in unique_dicts):
unique_dicts.append(dict_item)
return unique_dicts
def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
ocr_engine = PPStructure(table=False, ocr=ocr, show_log=show_log)
imgs = []
with fitz.open("pdf", pdf_bytes) as doc:
for index in range(0, doc.page_count):
page = doc[index]
dpi = 200
mat = fitz.Matrix(dpi / 72, dpi / 72)
pm = page.get_pixmap(matrix=mat, alpha=False)
# if width or height > 2000 pixels, don't enlarge the image
# if pm.width > 2000 or pm.height > 2000:
# pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
img_dict = {
"img": img,
"width": pm.width,
"height": pm.height
}
imgs.append(img_dict)
model_json = []
for index, img_dict in enumerate(imgs):
img = img_dict['img']
page_width = img_dict['width']
page_height = img_dict['height']
result = ocr_engine(img)
spans = []
for line in result:
line.pop('img')
'''
map the paddle output types to category numbers:
title: 0 # heading
text: 1 # body text
header: 2 # abandon
footer: 2 # abandon
reference: 1 # body text or abandon
equation: 8 # display formula, block
equation: 14 # display formula, text
figure: 3 # image
figure_caption: 4 # image caption
table: 5 # table
table_caption: 6 # table caption
'''
if line['type'] == 'title':
line['category_id'] = 0
elif line['type'] in ['text', 'reference']:
line['category_id'] = 1
elif line['type'] == 'figure':
line['category_id'] = 3
elif line['type'] == 'figure_caption':
line['category_id'] = 4
elif line['type'] == 'table':
line['category_id'] = 5
elif line['type'] == 'table_caption':
line['category_id'] = 6
elif line['type'] == 'equation':
line['category_id'] = 8
elif line['type'] in ['header', 'footer']:
line['category_id'] = 2
else:
logger.warning(f"unknown type: {line['type']}")
# compatibility with paddleocr versions that do not output a score
if line.get("score") is None:
line['score'] = 0.5 + random.random() * 0.5
res = line.pop('res', None)
if res is not None and len(res) > 0:
for span in res:
new_span = {'category_id': 15,
'bbox': region_to_bbox(span['text_region']),
'score': span['confidence'],
'text': span['text']
}
spans.append(new_span)
if len(spans) > 0:
result.extend(spans)
result = remove_duplicates_dicts(result)
page_info = {
"page_no": index,
"height": page_height,
"width": page_width
}
page_dict = {
"layout_dets": result,
"page_info": page_info
}
model_json.append(page_dict)
return model_json

View File

@@ -0,0 +1,2 @@
class MODEL:
Paddle = "pp_structure_v2"

View File

@@ -0,0 +1,75 @@
import random
from loguru import logger
from paddleocr import PPStructure
def region_to_bbox(region):
x0 = region[0][0]
y0 = region[0][1]
x1 = region[2][0]
y1 = region[2][1]
return [x0, y0, x1, y1]
class CustomPaddleModel:
def __init__(self, ocr: bool = False, show_log: bool = False):
self.model = PPStructure(table=False, ocr=ocr, show_log=show_log)
def __call__(self, img):
result = self.model(img)
spans = []
for line in result:
line.pop("img")
"""
为paddle输出适配type no.
title: 0 # 标题
text: 1 # 文本
header: 2 # abandon
footer: 2 # abandon
reference: 1 # 文本 or abandon
equation: 8 # 行间公式 block
equation: 14 # 行间公式 text
figure: 3 # 图片
figure_caption: 4 # 图片描述
table: 5 # 表格
table_caption: 6 # 表格描述
"""
if line["type"] == "title":
line["category_id"] = 0
elif line["type"] in ["text", "reference"]:
line["category_id"] = 1
elif line["type"] == "figure":
line["category_id"] = 3
elif line["type"] == "figure_caption":
line["category_id"] = 4
elif line["type"] == "table":
line["category_id"] = 5
elif line["type"] == "table_caption":
line["category_id"] = 6
elif line["type"] == "equation":
line["category_id"] = 8
elif line["type"] in ["header", "footer"]:
line["category_id"] = 2
else:
logger.warning(f"unknown type: {line['type']}")
# compatibility with paddleocr versions that do not output a score
if line.get("score") is None:
line["score"] = 0.5 + random.random() * 0.5
res = line.pop("res", None)
if res is not None and len(res) > 0:
for span in res:
new_span = {
"category_id": 15,
"bbox": region_to_bbox(span["text_region"]),
"score": span["confidence"],
"text": span["text"],
}
spans.append(new_span)
if len(spans) > 0:
result.extend(spans)
return result
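Under the new framework a backend is just a callable that maps a BGR page image to a list of layout detections in the category_id scheme above, so plugging in a different model means mimicking this interface. A hypothetical stub (DummyModel and its output fields are illustrative, not part of the repo):

import numpy as np

class DummyModel:
    """Stand-in backend: emits one fake title box per page."""
    def __call__(self, img: np.ndarray) -> list:
        h, w = img.shape[:2]
        # same shape of record that doc_analyze collects into layout_dets
        return [{"category_id": 0, "bbox": [0, 0, w, 40], "score": 0.99}]

Wiring it in would mean adding an entry to the MODEL class and a corresponding branch in doc_analyze's backend dispatch.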

View File

@@ -83,6 +83,7 @@ class AbsPipe(ABC):
pdf_meta["text_len_per_page"],
pdf_meta["imgs_per_page"],
pdf_meta["text_layout_per_page"],
pdf_meta["invalid_chars"],
)
if is_text_pdf:
return AbsPipe.PIP_TXT

View File

@@ -1,5 +1,5 @@
from magic_pdf.libs.MakeContentConfig import DropMode
from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.pipe.AbsPipe import AbsPipe
from magic_pdf.user_api import parse_ocr_pdf

View File

@@ -1,5 +1,5 @@
from magic_pdf.libs.MakeContentConfig import DropMode
from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.json_compressor import JsonCompressor
from magic_pdf.pipe.AbsPipe import AbsPipe

View File

@@ -3,7 +3,7 @@ import json
from loguru import logger
from magic_pdf.libs.MakeContentConfig import DropMode
from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.libs.commons import join_path

View File

@@ -16,7 +16,7 @@ import re
from loguru import logger
from magic_pdf.libs.version import __version__
from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.rw import AbsReaderWriter
from magic_pdf.pdf_parse_by_ocr_v2 import parse_pdf_by_ocr
from magic_pdf.pdf_parse_by_txt_v2 import parse_pdf_by_txt
@@ -86,41 +86,46 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
return None
pdf_info_dict = parse_pdf(parse_pdf_by_txt)
text_all = ""
for page_dict in pdf_info_dict['pdf_info']:
for para_block in page_dict['para_blocks']:
if para_block['type'] in ['title', 'text']:
for line in para_block['lines']:
for span in line['spans']:
text_all += span['content']
# text_all = ""
# for page_dict in pdf_info_dict['pdf_info']:
# for para_block in page_dict['para_blocks']:
# if para_block['type'] in ['title', 'text']:
# for line in para_block['lines']:
# for span in line['spans']:
# text_all += span['content']
def calculate_not_common_character_rate(text):
garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]')
# count the number of garbled characters
garbage_count = len(garbage_regex.findall(text))
total = len(text)
if total == 0:
return 0 # avoid division-by-zero errors
return garbage_count / total
def calculate_not_printable_rate(text):
printable = sum(1 for c in text if c.isprintable())
total = len(text)
if total == 0:
return 0 # avoid division-by-zero errors
return (total - printable) / total
not_common_character_rate = calculate_not_common_character_rate(text_all)
not_printable_rate = calculate_not_printable_rate(text_all)
pdf_info_dict["_not_common_character_rate"] = not_common_character_rate
pdf_info_dict["_not_printable_rate"] = not_printable_rate
logger.info(f"not_common_character_rate: {not_common_character_rate}, not_printable_rate: {not_printable_rate}")
# def calculate_not_common_character_rate(text):
# garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]')
# # count the number of garbled characters
# garbage_count = len(garbage_regex.findall(text))
# total = len(text)
# if total == 0:
# return 0 # avoid division-by-zero errors
# return garbage_count / total
#
# def calculate_not_printable_rate(text):
# printable_text = ""
# for c in text:
# if c.isprintable():
# printable_text += c
# printable_total = len(printable_text)
# total = len(text)
# if total == 0:
# return 0 # avoid division-by-zero errors
# return (total - printable_total) / total
#
# not_common_character_rate = calculate_not_common_character_rate(text_all)
# not_printable_rate = calculate_not_printable_rate(text_all)
# pdf_info_dict["_not_common_character_rate"] = not_common_character_rate
# pdf_info_dict["_not_printable_rate"] = not_printable_rate
# logger.info(f"not_common_character_rate: {not_common_character_rate}, not_printable_rate: {not_printable_rate}")
'''new logic: use pdfminer to identify garbled pdfs; accuracy is high with no false positives, and it now runs before the parsing flow'''
# not_common_character_rate can misfire on minority languages; not_printable_rate is friendlier to them
if (pdf_info_dict is None
or pdf_info_dict.get("_need_drop", False)
or not_printable_rate > 0.02 # in reference pdfs that are normal, this value never exceeds 0.01; the threshold is set to 0.02
or pdf_info_dict.get("_need_drop", False)
# or not_printable_rate > 0.02 # in reference pdfs that are normal, this value never exceeds 0.01; the threshold is set to 0.02
):
logger.warning(f"parse_pdf_by_txt drop or error or garbled_rate too large, switch to parse_pdf_by_ocr")
logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
if input_model_is_empty:
pdf_models = doc_analyze(pdf_bytes, ocr=True)
pdf_info_dict = parse_pdf(parse_pdf_by_ocr)

View File

@@ -14,5 +14,5 @@ wordninja>=2.0.0
scikit-learn>=1.0.2
nltk==3.8.1
s3pathlib>=2.1.1
paddlepaddle
paddleocr @ https://github.com/magicpdf/PaddleOCR/releases/download/paddleocr-2.8.2-released/paddleocr-2.8.2-py3-none-any.whl
paddleocr @ https://github.com/magicpdf/PaddleOCR/releases/download/paddleocr-2.8.2-released/paddleocr-2.8.2-py3-none-any.whl
pdfminer.six>=20231228

View File

@@ -24,6 +24,10 @@ if __name__ == '__main__':
version=__version__, # the version number is taken automatically from the tag
packages=find_packages(), # include all packages
install_requires=parse_requirements('requirements.txt'), # third-party dependencies of the project
extras_require={
"gpu": ["paddlepaddle-gpu"],
"cpu": ["paddlepaddle"],
},
python_requires=">=3.9", # 项目依赖的 Python 版本
# entry_points={"console_scripts": ["my_command=my_project.main:run"]}, # 项目提供的可执行命令
include_package_data=True, # 是否包含非代码文件,如数据文件、配置文件等
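The extras complement the removal of paddlepaddle from requirements.txt above: the heavy runtime is now chosen at install time, e.g. pip install "magic-pdf[cpu]" for paddlepaddle or pip install "magic-pdf[gpu]" for paddlepaddle-gpu (assuming the published package name is magic-pdf).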