mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
Merge pull request #1743 from xuzijie1995/master
Update the is_pdf function in ext.py to support PDFs whose filenames contain Chinese or other special characters
This commit is contained in:
1
.github/workflows/python-package.yml
vendored
1
.github/workflows/python-package.yml
vendored
@@ -123,4 +123,5 @@ jobs:
|
||||
- name: Publish distribution to PyPI
|
||||
run: |
|
||||
pip install twine
|
||||
twine check dist/*
|
||||
twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
|
||||
|
||||
@@ -1,37 +1,51 @@
|
||||
import hashlib
|
||||
import mimetypes
|
||||
import urllib.parse
|
||||
|
||||
|
||||
def _has_pdf_header(file):
    """Return True if *file* starts with the ``%PDF-`` magic bytes.

    Reads five bytes from the stream's current position and then rewinds the
    stream to offset 0 so the caller can read the content again. Any failure
    while reading, seeking or comparing (for example a closed stream, or a
    text-mode stream whose ``read`` returns ``str``) is treated as
    "not a PDF" rather than propagated.
    """
    try:
        header = file.read(5)
        file.seek(0)  # rewind so downstream consumers see the whole file
        return header.startswith(b'%PDF-')
    except Exception:
        # Best-effort sniffing: an unreadable stream cannot be confirmed
        # as a PDF, so report False instead of raising.
        return False


def is_pdf(filename, file):
    """Check whether a file is a PDF, tolerating non-ASCII and URL-encoded names.

    The decision combines two signals:

    1. The MIME type guessed from the URL-decoded file name. A name that
       clearly maps to a non-PDF type is rejected without touching the file.
    2. The ``%PDF-`` magic bytes at the start of the file content.

    When no MIME type can be derived from the name (for example an
    extension-less link such as https://arxiv.org/pdf/2405.08702, or a name
    that cannot be decoded), only the content check is used.

    :param filename: file name or URL path; may be percent-encoded
    :param file: binary file-like object supporting ``read`` and ``seek``;
        it is rewound to offset 0 after the header has been read
    :return: True if the file is considered a PDF, otherwise False
    """
    # Function-scope import keeps this block self-contained.
    import logging

    try:
        # Percent-decode first so names such as "%E4%B8%AD%E6%96%87.pdf"
        # are examined in their real form.
        decoded_filename = urllib.parse.unquote(filename)
        mime_type, _ = mimetypes.guess_type(decoded_filename)
    except Exception as exc:
        # A malformed or non-string name must not prevent the content
        # check; fall back to header sniffing only.
        logging.getLogger(__name__).warning(
            "Error checking PDF format: %s", exc
        )
        mime_type = None

    # A name that resolves to some other MIME type is rejected outright.
    if mime_type is not None and mime_type != 'application/pdf':
        return False

    # Either the name says PDF or it says nothing: the content decides.
    return _has_pdf_header(file)
|
||||
|
||||
|
||||
def url_is_pdf(file):
|
||||
|
||||
@@ -143,6 +143,14 @@
|
||||
"created_at": "2025-01-20T05:30:38Z",
|
||||
"repoId": 765083837,
|
||||
"pullRequestNo": 1578
|
||||
},
|
||||
{
|
||||
"name": "shniubobo",
|
||||
"id": 6594544,
|
||||
"comment_id": 2660086464,
|
||||
"created_at": "2025-02-14T19:15:25Z",
|
||||
"repoId": 765083837,
|
||||
"pullRequestNo": 1693
|
||||
}
|
||||
]
|
||||
}
|
||||
Reference in New Issue
Block a user