fix: extend suffix guessing so files detected as HTML but named .pdf are treated as PDF

2026-03-27 11:08:32 +07:00 · 2025-12-12 16:14:22 +08:00
parent a4b07a0d1f
commit 6fc343b14e
1 changed files with 2 additions and 2 deletions
--- a/mineru/utils/guess_suffix_or_lang.py
+++ b/mineru/utils/guess_suffix_or_lang.py
@@ -14,7 +14,7 @@ def guess_language_by_text(code):

 def guess_suffix_by_bytes(file_bytes, file_path=None) -> str:
    suffix = magika.identify_bytes(file_bytes).prediction.output.label
-    if file_path and suffix in ["ai"] and Path(file_path).suffix.lower() in [".pdf"]:
+    if file_path and suffix in ["ai", "html"] and Path(file_path).suffix.lower() in [".pdf"]:
        suffix = "pdf"
    return suffix

@@ -23,6 +23,6 @@ def guess_suffix_by_path(file_path) -> str:
    if not isinstance(file_path, Path):
        file_path = Path(file_path)
    suffix = magika.identify_path(file_path).prediction.output.label
-    if suffix in ["ai"] and file_path.suffix.lower() in [".pdf"]:
+    if suffix in ["ai", "html"] and file_path.suffix.lower() in [".pdf"]:
        suffix = "pdf"
    return suffix