Compare commits

..

15 Commits

Author SHA1 Message Date
Xiaomeng Zhao
80dc57e7ce Merge pull request #3609 from myhloli/dev
Bump mineru-vl-utils dependency to version 0.1.11
2025-09-26 05:48:32 +08:00
myhloli
d84a006f6d Bump mineru-vl-utils dependency to version 0.1.11 2025-09-26 05:47:27 +08:00
Xiaomeng Zhao
2c5361bf8e Merge pull request #3607 from myhloli/dev
Update changelog for version 2.5.4 to document PDF identification fix
2025-09-26 05:43:50 +08:00
myhloli
eb01b7acf9 Update changelog for version 2.5.4 to document PDF identification fix 2025-09-26 05:42:43 +08:00
Xiaomeng Zhao
5656f1363b Merge pull request #3606 from myhloli/dev
Dev
2025-09-26 05:35:29 +08:00
myhloli
c9315b8e10 Refactor suffix guessing to handle PDF extensions for AI files 2025-09-26 05:31:46 +08:00
myhloli
907099762f Normalize PDF suffix handling for AI files to be case-insensitive 2025-09-26 05:09:19 +08:00
myhloli
2c356cccee Fix suffix identification for AI files to correctly handle PDF extensions 2025-09-26 05:02:56 +08:00
myhloli
0f62f166e6 Enhance image link replacement to handle only .jpg files while preserving other formats 2025-09-26 04:52:05 +08:00
Xiaomeng Zhao
c7a64e72dc Merge pull request #3563 from myhloli/dev
Update model output handling in test_e2e.py to write JSON format instead of text
2025-09-21 02:49:31 +08:00
myhloli
3cb3a94830 Merge remote-tracking branch 'origin/dev' into dev 2025-09-21 02:48:45 +08:00
myhloli
8301fa4c20 Update model output handling in test_e2e.py to write JSON format instead of text 2025-09-21 02:47:56 +08:00
Xiaomeng Zhao
4400f4b75f Merge pull request #3558 from opendatalab/master
master->dev
2025-09-20 15:37:45 +08:00
myhloli
92efb8f96e Update version.py with new version 2025-09-20 07:36:01 +00:00
Xiaomeng Zhao
9a88cbfb09 Merge pull request #3545 from opendatalab/release-2.5.3
Release 2.5.3
2025-09-20 15:33:58 +08:00
8 changed files with 29 additions and 11 deletions

View File

@@ -44,6 +44,9 @@
# Changelog
- 2025/09/26 2.5.4 Released
- Fixed an issue where some `PDF` files were mistakenly identified as `AI` files, causing parsing failures.
- 2025/09/20 2.5.3 Released
- Dependency version range adjustment to enable Turing and earlier architecture GPUs to use vLLM acceleration for MinerU2.5 model inference.
- `pipeline` backend compatibility fixes for torch 2.8.0.

View File

@@ -43,6 +43,10 @@
</div>
# 更新记录
- 2025/09/26 2.5.4 发布
- 修复部分`pdf`文件被识别成`ai`文件导致无法解析的问题。
- 2025/09/20 2.5.3 发布
- 依赖版本范围调整使得Turing及更早架构显卡可以使用vLLM加速推理MinerU2.5模型。
- `pipeline`后端对torch 2.8.0的一些兼容性修复。

View File

@@ -26,7 +26,7 @@ def read_fn(path):
path = Path(path)
with open(str(path), "rb") as input_file:
file_bytes = input_file.read()
file_suffix = guess_suffix_by_bytes(file_bytes)
file_suffix = guess_suffix_by_bytes(file_bytes, path)
if file_suffix in image_suffixes:
return images_bytes_to_pdf_bytes(file_bytes)
elif file_suffix in pdf_suffixes:

View File

@@ -86,10 +86,14 @@ def replace_image_with_base64(markdown_text, image_dir_path):
# 替换图片链接
def replace(match):
relative_path = match.group(1)
full_path = os.path.join(image_dir_path, relative_path)
base64_image = image_to_base64(full_path)
return f'![{relative_path}](data:image/jpeg;base64,{base64_image})'
# 只处理以.jpg结尾的图片
if relative_path.endswith('.jpg'):
full_path = os.path.join(image_dir_path, relative_path)
base64_image = image_to_base64(full_path)
return f'![{relative_path}](data:image/jpeg;base64,{base64_image})'
else:
# 其他格式的图片保持原样
return match.group(0)
# 应用替换
return re.sub(pattern, replace, markdown_text)

View File

@@ -1,3 +1,5 @@
from pathlib import Path
from magika import Magika
@@ -10,11 +12,17 @@ def guess_language_by_text(code):
return lang if lang != "unknown" else DEFAULT_LANG
def guess_suffix_by_bytes(file_bytes) -> str:
def guess_suffix_by_bytes(file_bytes, file_path=None) -> str:
suffix = magika.identify_bytes(file_bytes).prediction.output.label
if file_path and suffix in ["ai"] and Path(file_path).suffix.lower() in [".pdf"]:
suffix = "pdf"
return suffix
def guess_suffix_by_path(file_path) -> str:
if not isinstance(file_path, Path):
file_path = Path(file_path)
suffix = magika.identify_path(file_path).prediction.output.label
if suffix in ["ai"] and file_path.suffix.lower() in [".pdf"]:
suffix = "pdf"
return suffix

View File

@@ -1 +1 @@
__version__ = "2.5.2"
__version__ = "2.5.3"

View File

@@ -39,7 +39,7 @@ dependencies = [
"openai>=1.70.0,<2",
"beautifulsoup4>=4.13.5,<5",
"magika>=0.6.2,<0.7.0",
"mineru-vl-utils>=0.1.8,<1",
"mineru-vl-utils>=0.1.11,<1",
]
[project.optional-dependencies]

View File

@@ -154,10 +154,9 @@ def test_vlm_transformers_with_default_config():
json.dumps(middle_json, ensure_ascii=False, indent=4),
)
model_output = ("\n" + "-" * 50 + "\n").join(infer_result)
md_writer.write_string(
f"{pdf_file_name}_model_output.txt",
model_output,
f"{pdf_file_name}_model.json",
json.dumps(infer_result, ensure_ascii=False, indent=4),
)
logger.info(f"local output dir is {local_md_dir}")