mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
refactor(pdf_check): improve character detection using PyMuPDF
- Replace pdfminer with PyMuPDF for character detection
- Implement new method detect_invalid_chars_by_pymupdf
- Update check_invalid_chars in pdf_meta_scan.py to use the new method
- Add __replace_0xfffd function in pdf_parse_union_core_v2.py to handle special characters
- Remove unused imports and update requirements.txt
This commit is contained in:
@@ -8,7 +8,7 @@ from loguru import logger
|
||||
from magic_pdf.config.drop_reason import DropReason
|
||||
from magic_pdf.libs.commons import get_top_percent_list, mymax
|
||||
from magic_pdf.libs.language import detect_lang
|
||||
from magic_pdf.libs.pdf_check import detect_invalid_chars
|
||||
from magic_pdf.libs.pdf_check import detect_invalid_chars_by_pymupdf
|
||||
|
||||
scan_max_page = 50
|
||||
junk_limit_min = 10
|
||||
@@ -323,7 +323,7 @@ def get_language(doc: fitz.Document):
|
||||
|
||||
def check_invalid_chars(pdf_bytes):
    """Garbled-text detection.

    Delegates to the PyMuPDF-based detector (the commit replaces the old
    pdfminer-based detect_invalid_chars with detect_invalid_chars_by_pymupdf).

    Args:
        pdf_bytes: raw bytes of the PDF document.

    Returns:
        bool: True for a normal document, False for a garbled one.
    """
    return detect_invalid_chars_by_pymupdf(pdf_bytes)
|
||||
|
||||
|
||||
def pdf_meta_scan(pdf_bytes: bytes):
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
from io import BytesIO
|
||||
import re
|
||||
import fitz
|
||||
import numpy as np
|
||||
from loguru import logger
|
||||
from pdfminer.high_level import extract_text
|
||||
# import re
|
||||
# from io import BytesIO
|
||||
# from pdfminer.high_level import extract_text
|
||||
|
||||
|
||||
def calculate_sample_count(total_page: int):
|
||||
@@ -14,7 +14,7 @@ def calculate_sample_count(total_page: int):
|
||||
return select_page_cnt
|
||||
|
||||
|
||||
def extract_pages(src_pdf_bytes: bytes):
|
||||
def extract_pages(src_pdf_bytes: bytes) -> fitz.Document:
|
||||
pdf_docs = fitz.open("pdf", src_pdf_bytes)
|
||||
total_page = len(pdf_docs)
|
||||
if total_page == 0:
|
||||
@@ -33,30 +33,57 @@ def extract_pages(src_pdf_bytes: bytes):
|
||||
return sample_docs
|
||||
|
||||
|
||||
def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
|
||||
""""
|
||||
检测PDF中是否包含非法字符
|
||||
# def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
|
||||
# """"
|
||||
# 检测PDF中是否包含非法字符
|
||||
# """
|
||||
# '''pdfminer比较慢,需要先随机抽取10页左右的sample'''
|
||||
# sample_docs = extract_pages(src_pdf_bytes)
|
||||
# sample_pdf_bytes = sample_docs.tobytes()
|
||||
# sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
|
||||
# text = extract_text(sample_pdf_file_like_object)
|
||||
# text = text.replace("\n", "")
|
||||
# # logger.info(text)
|
||||
# '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
|
||||
# cid_pattern = re.compile(r'\(cid:\d+\)')
|
||||
# matches = cid_pattern.findall(text)
|
||||
# cid_count = len(matches)
|
||||
# cid_len = sum(len(match) for match in matches)
|
||||
# text_len = len(text)
|
||||
# if text_len == 0:
|
||||
# cid_chars_radio = 0
|
||||
# else:
|
||||
# cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
|
||||
# logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
|
||||
# '''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
|
||||
# if cid_chars_radio > 0.05:
|
||||
# return False # 乱码文档
|
||||
# else:
|
||||
# return True # 正常文档
|
||||
|
||||
|
||||
def count_replacement_characters(text: str) -> int:
    """Count occurrences of the Unicode replacement character (U+FFFD) in *text*.

    PyMuPDF emits U+FFFD for characters it cannot decode, so this count is
    used as a garbled-text signal.
    """
    return text.count('\ufffd')
|
||||
|
||||
|
||||
def detect_invalid_chars_by_pymupdf(src_pdf_bytes: bytes) -> bool:
    """Detect whether a PDF's extractable text is garbled, using PyMuPDF.

    Extracts text from a sampled subset of pages (via extract_pages) and
    measures the ratio of U+FFFD replacement characters in it.

    Args:
        src_pdf_bytes: raw bytes of the source PDF.

    Returns:
        bool: False when more than 1% of the sampled text consists of
        replacement characters (garbled document), True otherwise.
    """
    sample_docs = extract_pages(src_pdf_bytes)
    doc_text = ""
    for page in sample_docs:
        page_text = page.get_text('text', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)
        doc_text += page_text
    text_len = len(doc_text)
    uffd_count = count_replacement_characters(doc_text)
    if text_len == 0:
        # Empty sample: no evidence of garbling.
        uffd_chars_radio = 0
    else:
        uffd_chars_radio = uffd_count / text_len
    logger.info(f"uffd_count: {uffd_count}, text_len: {text_len}, uffd_chars_radio: {uffd_chars_radio}")
    # A document is treated as garbled when more than 1% of its text is
    # replacement characters.
    if uffd_chars_radio > 0.01:
        return False  # garbled document
    else:
        return True  # normal document
|
||||
@@ -57,6 +57,13 @@ def __replace_STX_ETX(text_str: str):
|
||||
return text_str
|
||||
|
||||
|
||||
def __replace_0xfffd(text_str: str):
    """Replace \ufffd, as these characters become garbled when extracted using pymupdf."""
    # Empty or None input is returned unchanged.
    if not text_str:
        return text_str
    return text_str.replace('\ufffd', " ")
|
||||
|
||||
def chars_to_content(span):
|
||||
# 检查span中的char是否为空
|
||||
if len(span['chars']) == 0:
|
||||
@@ -76,7 +83,8 @@ def chars_to_content(span):
|
||||
if char['bbox'][0] - span['chars'][span['chars'].index(char) - 1]['bbox'][2] > char_avg_width:
|
||||
content += ' '
|
||||
content += char['c']
|
||||
span['content'] = __replace_STX_ETX(content)
|
||||
|
||||
span['content'] = __replace_0xfffd(content)
|
||||
|
||||
del span['chars']
|
||||
|
||||
@@ -140,7 +148,7 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
|
||||
|
||||
def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
|
||||
|
||||
text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP | fitz.TEXT_CID_FOR_UNKNOWN_UNICODE)['blocks']
|
||||
text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
|
||||
|
||||
all_pymu_chars = []
|
||||
for block in text_blocks_raw:
|
||||
|
||||
@@ -4,10 +4,10 @@ click>=8.1.7
|
||||
fast-langdetect==0.2.0
|
||||
loguru>=0.6.0
|
||||
numpy>=1.21.6,<2.0.0
|
||||
pdfminer.six==20231228
|
||||
pydantic>=2.7.2,<2.8.0
|
||||
PyMuPDF>=1.24.9
|
||||
scikit-learn>=1.0.2
|
||||
torch>=2.2.2,<=2.3.1
|
||||
transformers
|
||||
# pdfminer.six==20231228
|
||||
# The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.
|
||||
|
||||
Reference in New Issue
Block a user