mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 19:18:34 +07:00
Compare commits
15 Commits
release-2.
...
release-2.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
80dc57e7ce | ||
|
|
d84a006f6d | ||
|
|
2c5361bf8e | ||
|
|
eb01b7acf9 | ||
|
|
5656f1363b | ||
|
|
c9315b8e10 | ||
|
|
907099762f | ||
|
|
2c356cccee | ||
|
|
0f62f166e6 | ||
|
|
c7a64e72dc | ||
|
|
3cb3a94830 | ||
|
|
8301fa4c20 | ||
|
|
4400f4b75f | ||
|
|
92efb8f96e | ||
|
|
9a88cbfb09 |
@@ -44,6 +44,9 @@
|
||||
|
||||
# Changelog
|
||||
|
||||
- 2025/09/26 2.5.4 Released
|
||||
- Fixed an issue where some `PDF` files were mistakenly identified as `AI` files, causing parsing failures
|
||||
|
||||
- 2025/09/20 2.5.3 Released
|
||||
- Adjusted dependency version ranges so that GPUs with Turing and earlier architectures can use vLLM acceleration for MinerU2.5 model inference.
|
||||
- Fixed compatibility issues between the `pipeline` backend and torch 2.8.0.
|
||||
|
||||
@@ -43,6 +43,10 @@
|
||||
</div>
|
||||
|
||||
# 更新记录
|
||||
|
||||
- 2025/09/26 2.5.4 发布
|
||||
- 修复部分`pdf`文件被识别成`ai`文件导致无法解析的问题
|
||||
|
||||
- 2025/09/20 2.5.3 发布
|
||||
- 依赖版本范围调整,使得Turing及更早架构显卡可以使用vLLM加速推理MinerU2.5模型。
|
||||
- `pipeline`后端对torch 2.8.0的一些兼容性修复。
|
||||
|
||||
@@ -26,7 +26,7 @@ def read_fn(path):
|
||||
path = Path(path)
|
||||
with open(str(path), "rb") as input_file:
|
||||
file_bytes = input_file.read()
|
||||
file_suffix = guess_suffix_by_bytes(file_bytes)
|
||||
file_suffix = guess_suffix_by_bytes(file_bytes, path)
|
||||
if file_suffix in image_suffixes:
|
||||
return images_bytes_to_pdf_bytes(file_bytes)
|
||||
elif file_suffix in pdf_suffixes:
|
||||
|
||||
@@ -86,10 +86,14 @@ def replace_image_with_base64(markdown_text, image_dir_path):
|
||||
# 替换图片链接
|
||||
def replace(match):
|
||||
relative_path = match.group(1)
|
||||
full_path = os.path.join(image_dir_path, relative_path)
|
||||
base64_image = image_to_base64(full_path)
|
||||
return f''
|
||||
|
||||
# 只处理以.jpg结尾的图片
|
||||
if relative_path.endswith('.jpg'):
|
||||
full_path = os.path.join(image_dir_path, relative_path)
|
||||
base64_image = image_to_base64(full_path)
|
||||
return f''
|
||||
else:
|
||||
# 其他格式的图片保持原样
|
||||
return match.group(0)
|
||||
# 应用替换
|
||||
return re.sub(pattern, replace, markdown_text)
|
||||
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
from pathlib import Path
|
||||
|
||||
from magika import Magika
|
||||
|
||||
|
||||
@@ -10,11 +12,17 @@ def guess_language_by_text(code):
|
||||
return lang if lang != "unknown" else DEFAULT_LANG
|
||||
|
||||
|
||||
def guess_suffix_by_bytes(file_bytes) -> str:
|
||||
def guess_suffix_by_bytes(file_bytes, file_path=None) -> str:
|
||||
suffix = magika.identify_bytes(file_bytes).prediction.output.label
|
||||
if file_path and suffix in ["ai"] and Path(file_path).suffix.lower() in [".pdf"]:
|
||||
suffix = "pdf"
|
||||
return suffix
|
||||
|
||||
|
||||
def guess_suffix_by_path(file_path) -> str:
|
||||
if not isinstance(file_path, Path):
|
||||
file_path = Path(file_path)
|
||||
suffix = magika.identify_path(file_path).prediction.output.label
|
||||
if suffix in ["ai"] and file_path.suffix.lower() in [".pdf"]:
|
||||
suffix = "pdf"
|
||||
return suffix
|
||||
@@ -1 +1 @@
|
||||
__version__ = "2.5.2"
|
||||
__version__ = "2.5.3"
|
||||
|
||||
@@ -39,7 +39,7 @@ dependencies = [
|
||||
"openai>=1.70.0,<2",
|
||||
"beautifulsoup4>=4.13.5,<5",
|
||||
"magika>=0.6.2,<0.7.0",
|
||||
"mineru-vl-utils>=0.1.8,<1",
|
||||
"mineru-vl-utils>=0.1.11,<1",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
|
||||
@@ -154,10 +154,9 @@ def test_vlm_transformers_with_default_config():
|
||||
json.dumps(middle_json, ensure_ascii=False, indent=4),
|
||||
)
|
||||
|
||||
model_output = ("\n" + "-" * 50 + "\n").join(infer_result)
|
||||
md_writer.write_string(
|
||||
f"{pdf_file_name}_model_output.txt",
|
||||
model_output,
|
||||
f"{pdf_file_name}_model.json",
|
||||
json.dumps(infer_result, ensure_ascii=False, indent=4),
|
||||
)
|
||||
|
||||
logger.info(f"local output dir is {local_md_dir}")
|
||||
|
||||
Reference in New Issue
Block a user