mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 19:18:34 +07:00
Compare commits
9 Commits
add_docx
...
release-2.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b69191ba2b | ||
|
|
0028514ced | ||
|
|
8d8daf6851 | ||
|
|
815280dd23 | ||
|
|
7b52f92aea | ||
|
|
33543b76c9 | ||
|
|
ea5f8e98dd | ||
|
|
8996e06448 | ||
|
|
bfb304ef1f |
@@ -45,6 +45,11 @@
|
||||
|
||||
# Changelog
|
||||
|
||||
- 2026/01/06 2.7.1 Release
|
||||
- Fixed bug #4300
|
||||
- Updated pdfminer.six dependency version to resolve [CVE-2025-64512](https://github.com/advisories/GHSA-wf5f-4jwr-ppcp)
|
||||
- Added support for automatically correcting the EXIF orientation of input images to improve OCR recognition accuracy #4283
|
||||
|
||||
- 2025/12/30 2.7.0 Release
|
||||
- Simplified installation process. No need to separately install `vlm` acceleration engine dependencies. Using `uv pip install mineru[all]` during installation will install all optional backend dependencies.
|
||||
- Added new `hybrid` backend, which combines the advantages of `pipeline` and `vlm` backends. Built on vlm, it integrates some capabilities of pipeline, adding extra extensibility on top of high accuracy:
|
||||
|
||||
@@ -45,6 +45,11 @@
|
||||
|
||||
# 更新记录
|
||||
|
||||
- 2026/01/06 2.7.1 发布
|
||||
- 修复 bug:#4300
|
||||
- 更新pdfminer.six的依赖版本以解决 [CVE-2025-64512](https://github.com/advisories/GHSA-wf5f-4jwr-ppcp)
|
||||
- 支持输入图像的exif方向自动校正,提升OCR识别效果 #4283
|
||||
|
||||
- 2025/12/30 2.7.0 发布
|
||||
- 简化安装流程,现在不再需要单独安装`vlm`加速引擎依赖包,安装时使用`uv pip install mineru[all]`即可安装所有可选后端的依赖包。
|
||||
- 增加全新后端`hybrid`,该后端结合了`pipeline`和`vlm`后端的优势,在vlm的基础上,融入了pipeline的部分能力,在高精度的基础上增加了额外的扩展性:
|
||||
|
||||
@@ -17,8 +17,6 @@ from mineru.utils.pdf_image_tools import images_bytes_to_pdf_bytes
|
||||
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
|
||||
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
|
||||
from mineru.backend.vlm.vlm_analyze import aio_doc_analyze as aio_vlm_doc_analyze
|
||||
from mineru.backend.hybrid.hybrid_analyze import doc_analyze as hybrid_doc_analyze
|
||||
from mineru.backend.hybrid.hybrid_analyze import aio_doc_analyze as aio_hybrid_doc_analyze
|
||||
from mineru.utils.pdf_page_id import get_end_page_id
|
||||
|
||||
if os.getenv("MINERU_LMDEPLOY_DEVICE", "") == "maca":
|
||||
@@ -326,6 +324,7 @@ def _process_hybrid(
|
||||
server_url=None,
|
||||
**kwargs,
|
||||
):
|
||||
from mineru.backend.hybrid.hybrid_analyze import doc_analyze as hybrid_doc_analyze
|
||||
"""同步处理hybrid后端逻辑"""
|
||||
if not backend.endswith("client"):
|
||||
server_url = None
|
||||
@@ -378,8 +377,8 @@ async def _async_process_hybrid(
|
||||
server_url=None,
|
||||
**kwargs,
|
||||
):
|
||||
from mineru.backend.hybrid.hybrid_analyze import aio_doc_analyze as aio_hybrid_doc_analyze
|
||||
"""异步处理hybrid后端逻辑"""
|
||||
|
||||
if not backend.endswith("client"):
|
||||
server_url = None
|
||||
|
||||
|
||||
@@ -232,13 +232,17 @@ def images_bytes_to_pdf_bytes(image_bytes):
|
||||
# 载入并转换所有图像为 RGB 模式
|
||||
image = Image.open(BytesIO(image_bytes))
|
||||
# 根据 EXIF 信息自动转正(处理手机拍摄的带 Orientation 标记的图片)
|
||||
ImageOps.exif_transpose(image, in_place=True)
|
||||
image = ImageOps.exif_transpose(image) or image
|
||||
# 只在必要时转换
|
||||
if image.mode != "RGB":
|
||||
image = image.convert("RGB")
|
||||
|
||||
# 第一张图保存为 PDF,其余追加
|
||||
image.save(pdf_buffer, format="PDF", save_all=True)
|
||||
image.save(
|
||||
pdf_buffer,
|
||||
format="PDF",
|
||||
# save_all=True
|
||||
)
|
||||
|
||||
# 获取 PDF bytes 并重置指针(可选)
|
||||
pdf_bytes = pdf_buffer.getvalue()
|
||||
|
||||
@@ -21,7 +21,7 @@ dependencies = [
|
||||
"click>=8.1.7",
|
||||
"loguru>=0.7.2",
|
||||
"numpy>=1.21.6",
|
||||
"pdfminer.six==20250506",
|
||||
"pdfminer.six>=20251230",
|
||||
"tqdm>=4.67.1",
|
||||
"requests",
|
||||
"httpx",
|
||||
@@ -94,10 +94,10 @@ core = [
|
||||
"mineru[pipeline]",
|
||||
"mineru[api]",
|
||||
"mineru[gradio]",
|
||||
"mineru[mlx] ; sys_platform == 'darwin'",
|
||||
]
|
||||
all = [
|
||||
"mineru[core]",
|
||||
"mineru[mlx] ; sys_platform == 'darwin'",
|
||||
"mineru[vllm] ; sys_platform == 'linux'",
|
||||
"mineru[lmdeploy] ; sys_platform == 'windows'",
|
||||
]
|
||||
|
||||
Reference in New Issue
Block a user