Merge pull request #1743 from xuzijie1995/master

Update ext.py is_pdf function to support PDF filenames with Chinese characters and special characters
2026-03-27 11:08:32 +07:00 · 2025-02-24 10:44:27 +08:00
parent 8dd01346be 034034c6e0
commit 08dd3a85a8
3 changed files with 44 additions and 21 deletions
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -123,4 +123,5 @@ jobs:
      - name: Publish distribution to PyPI
        run: |
          pip install twine
+          twine check dist/*
          twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
--- a/projects/web_demo/web_demo/common/ext.py
+++ b/projects/web_demo/web_demo/common/ext.py
@@ -1,37 +1,51 @@
 import hashlib
 import mimetypes
+import urllib.parse


 def is_pdf(filename, file):
    """
-    判断文件是否为PDF格式。
+    判断文件是否为PDF格式，支持中文名和特殊字符。

    :param filename: 文件名
    :param file: 文件对象
    :return: 如果文件是PDF格式，则返回True，否则返回False
    """
-    # 检查文件扩展名  https://arxiv.org/pdf/2405.08702 pdf链接可能存在不带扩展名的情况，先注释
-    # if not filename.endswith('.pdf'):
-    #     return False
+    try:
+        # 对文件名进行URL解码，处理特殊字符
+        decoded_filename = urllib.parse.unquote(filename)
+        
+        # 检查MIME类型
+        mime_type, _ = mimetypes.guess_type(decoded_filename)
+        print(f"Detected MIME type: {mime_type}")
+        
+        # 某些情况下mime_type可能为None，需要特殊处理
+        if mime_type is None:
+            # 只检查文件内容的PDF标识
+            file_start = file.read(5)
+            file.seek(0)  # 重置文件指针
+            return file_start.startswith(b'%PDF-')
+            
+        if mime_type != 'application/pdf':
+            return False

-    # 检查MIME类型
-    mime_type, _ = mimetypes.guess_type(filename)
-    print(mime_type)
-    if mime_type != 'application/pdf':
-        return False
+        # 检查文件内容的PDF标识
+        file_start = file.read(5)
+        file.seek(0)  # 重置文件指针
+        if not file_start.startswith(b'%PDF-'):
+            return False

-    # 可选：读取文件的前几KB内容并检查MIME类型
-    # 这一步是可选的，用于更严格的检查
-    # if not mimetypes.guess_type(filename, strict=False)[0] == 'application/pdf':
-    #     return False
-
-    # 检查文件内容
-    file_start = file.read(5)
-    file.seek(0)
-    if not file_start.startswith(b'%PDF-'):
-        return False
-
-    return True
+        return True
+        
+    except Exception as e:
+        print(f"Error checking PDF format: {str(e)}")
+        # 发生错误时，仍然尝试通过文件头判断
+        try:
+            file_start = file.read(5)
+            file.seek(0)
+            return file_start.startswith(b'%PDF-')
+        except:
+            return False


 def url_is_pdf(file):
--- a/signatures/version1/cla.json
+++ b/signatures/version1/cla.json
@@ -143,6 +143,14 @@
      "created_at": "2025-01-20T05:30:38Z",
      "repoId": 765083837,
      "pullRequestNo": 1578
+    },
+    {
+      "name": "shniubobo",
+      "id": 6594544,
+      "comment_id": 2660086464,
+      "created_at": "2025-02-14T19:15:25Z",
+      "repoId": 765083837,
+      "pullRequestNo": 1693
    }
  ]
 }