From ac88815620933dba8d657f68b59b07199509dcd3 Mon Sep 17 00:00:00 2001 From: myhloli Date: Thu, 28 Nov 2024 22:34:23 +0800 Subject: [PATCH] refactor(pdf_check): improve character detection using PyMuPDF - Replace pdfminer with PyMuPDF for character detection - Implement new method detect_invalid_chars_by_pymupdf - Update check_invalid_chars in pdf_meta_scan.py to use new method - Add __replace_0xfffd function in pdf_parse_union_core_v2.py to handle special characters - Remove unused imports and update requirements.txt --- magic_pdf/filter/pdf_meta_scan.py | 4 +- magic_pdf/libs/pdf_check.py | 77 +++++++++++++++++++--------- magic_pdf/pdf_parse_union_core_v2.py | 12 ++++- requirements.txt | 2 +- 4 files changed, 65 insertions(+), 30 deletions(-) diff --git a/magic_pdf/filter/pdf_meta_scan.py b/magic_pdf/filter/pdf_meta_scan.py index 4345be55..400c1707 100644 --- a/magic_pdf/filter/pdf_meta_scan.py +++ b/magic_pdf/filter/pdf_meta_scan.py @@ -8,7 +8,7 @@ from loguru import logger from magic_pdf.config.drop_reason import DropReason from magic_pdf.libs.commons import get_top_percent_list, mymax from magic_pdf.libs.language import detect_lang -from magic_pdf.libs.pdf_check import detect_invalid_chars +from magic_pdf.libs.pdf_check import detect_invalid_chars_by_pymupdf scan_max_page = 50 junk_limit_min = 10 @@ -323,7 +323,7 @@ def get_language(doc: fitz.Document): def check_invalid_chars(pdf_bytes): """乱码检测.""" - return detect_invalid_chars(pdf_bytes) + return detect_invalid_chars_by_pymupdf(pdf_bytes) def pdf_meta_scan(pdf_bytes: bytes): diff --git a/magic_pdf/libs/pdf_check.py b/magic_pdf/libs/pdf_check.py index 3f9dc350..fe40a45a 100644 --- a/magic_pdf/libs/pdf_check.py +++ b/magic_pdf/libs/pdf_check.py @@ -1,9 +1,9 @@ -from io import BytesIO -import re import fitz import numpy as np from loguru import logger -from pdfminer.high_level import extract_text +# import re +# from io import BytesIO +# from pdfminer.high_level import extract_text def calculate_sample_count(total_page: int): @@ -14,7 +14,7 @@ def calculate_sample_count(total_page: int): return select_page_cnt -def extract_pages(src_pdf_bytes: bytes): +def extract_pages(src_pdf_bytes: bytes) -> fitz.Document: pdf_docs = fitz.open("pdf", src_pdf_bytes) total_page = len(pdf_docs) if total_page == 0: @@ -33,30 +33,57 @@ def extract_pages(src_pdf_bytes: bytes): return sample_docs -def detect_invalid_chars(src_pdf_bytes: bytes) -> bool: - """" - 检测PDF中是否包含非法字符 +# def detect_invalid_chars(src_pdf_bytes: bytes) -> bool: +# """" +# 检测PDF中是否包含非法字符 +# """ +# '''pdfminer比较慢,需要先随机抽取10页左右的sample''' +# sample_docs = extract_pages(src_pdf_bytes) +# sample_pdf_bytes = sample_docs.tobytes() +# sample_pdf_file_like_object = BytesIO(sample_pdf_bytes) +# text = extract_text(sample_pdf_file_like_object) +# text = text.replace("\n", "") +# # logger.info(text) +# '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)''' +# cid_pattern = re.compile(r'\(cid:\d+\)') +# matches = cid_pattern.findall(text) +# cid_count = len(matches) +# cid_len = sum(len(match) for match in matches) +# text_len = len(text) +# if text_len == 0: +# cid_chars_radio = 0 +# else: +# cid_chars_radio = cid_count/(cid_count + text_len - cid_len) +# logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}") +# '''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档''' +# if cid_chars_radio > 0.05: +# return False # 乱码文档 +# else: +# return True # 正常文档 + + +def count_replacement_characters(text: str) -> int: """ - '''pdfminer比较慢,需要先随机抽取10页左右的sample''' + 统计字符串中 0xfffd 字符的数量。 + """ + return text.count('\ufffd') + + +def detect_invalid_chars_by_pymupdf(src_pdf_bytes: bytes) -> bool: sample_docs = extract_pages(src_pdf_bytes) - sample_pdf_bytes = sample_docs.tobytes() - sample_pdf_file_like_object = BytesIO(sample_pdf_bytes) - text = extract_text(sample_pdf_file_like_object) - text = text.replace("\n", "") - # logger.info(text) - '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)''' - cid_pattern = re.compile(r'\(cid:\d+\)') - matches = cid_pattern.findall(text) - cid_count = len(matches) - cid_len = sum(len(match) for match in matches) - text_len = len(text) + doc_text = "" + for page in sample_docs: + page_text = page.get_text('text', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP) + doc_text += page_text + text_len = len(doc_text) + uffd_count = count_replacement_characters(doc_text) if text_len == 0: - cid_chars_radio = 0 + uffd_chars_radio = 0 else: - cid_chars_radio = cid_count/(cid_count + text_len - cid_len) - logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}") - '''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档''' - if cid_chars_radio > 0.05: + uffd_chars_radio = uffd_count / text_len + logger.info(f"uffd_count: {uffd_count}, text_len: {text_len}, uffd_chars_radio: {uffd_chars_radio}") + '''当一篇文章存在1%以上的文本是乱码时,认为该文档为乱码文档''' + if uffd_chars_radio > 0.01: return False # 乱码文档 else: - return True # 正常文档 + return True # 正常文档 \ No newline at end of file diff --git a/magic_pdf/pdf_parse_union_core_v2.py b/magic_pdf/pdf_parse_union_core_v2.py index 885cf1d7..dd5f3237 100644 --- a/magic_pdf/pdf_parse_union_core_v2.py +++ b/magic_pdf/pdf_parse_union_core_v2.py @@ -57,6 +57,13 @@ def __replace_STX_ETX(text_str: str): return text_str +def __replace_0xfffd(text_str: str): + """Replace \ufffd, as these characters become garbled when extracted using pymupdf.""" + if text_str: + s = text_str.replace('\ufffd', " ") + return s + return text_str + def chars_to_content(span): # 检查span中的char是否为空 if len(span['chars']) == 0: @@ -76,7 +83,8 @@ def chars_to_content(span): if char['bbox'][0] - span['chars'][span['chars'].index(char) - 1]['bbox'][2] > char_avg_width: content += ' ' content += char['c'] - span['content'] = __replace_STX_ETX(content) + + span['content'] = __replace_0xfffd(content) del span['chars'] @@ -140,7 +148,7 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag): def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang): - text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP | fitz.TEXT_CID_FOR_UNKNOWN_UNICODE)['blocks'] + text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks'] all_pymu_chars = [] for block in text_blocks_raw: diff --git a/requirements.txt b/requirements.txt index eced1426..d308e30a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,10 +4,10 @@ click>=8.1.7 fast-langdetect==0.2.0 loguru>=0.6.0 numpy>=1.21.6,<2.0.0 -pdfminer.six==20231228 pydantic>=2.7.2,<2.8.0 PyMuPDF>=1.24.9 scikit-learn>=1.0.2 torch>=2.2.2,<=2.3.1 transformers +# pdfminer.six==20231228 # The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.