mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
fix: extend suffix guessing logic to include HTML files for PDF detection
This commit is contained in:
@@ -14,7 +14,7 @@ def guess_language_by_text(code):
|
||||
|
||||
def guess_suffix_by_bytes(file_bytes, file_path=None) -> str:
    """Guess a file-type label from raw bytes, with a PDF override.

    The label comes from ``magika.identify_bytes``. When ``file_path`` is
    given and its extension is ``.pdf`` (case-insensitive), a detected
    label of ``"ai"`` or ``"html"`` is replaced by ``"pdf"``.

    Args:
        file_bytes: Content passed to ``magika.identify_bytes``.
        file_path: Optional path (anything ``Path()`` accepts) whose
            extension is consulted for the override. If falsy, the
            detected label is returned unchanged.

    Returns:
        The detected label, or ``"pdf"`` when the override applies.
    """
    suffix = magika.identify_bytes(file_bytes).prediction.output.label
    # Trust the ".pdf" extension over the content sniffing for these two
    # labels only; presumably such PDFs are misdetected (confirm upstream).
    if (
        file_path
        and suffix in ("ai", "html")
        and Path(file_path).suffix.lower() == ".pdf"
    ):
        suffix = "pdf"
    return suffix
|
||||
|
||||
@@ -23,6 +23,6 @@ def guess_suffix_by_path(file_path) -> str:
|
||||
if not isinstance(file_path, Path):
|
||||
file_path = Path(file_path)
|
||||
suffix = magika.identify_path(file_path).prediction.output.label
|
||||
if suffix in ["ai"] and file_path.suffix.lower() in [".pdf"]:
|
||||
if suffix in ["ai", "html"] and file_path.suffix.lower() in [".pdf"]:
|
||||
suffix = "pdf"
|
||||
return suffix
|
||||
Reference in New Issue
Block a user