Merge pull request #4660 from myhloli/dev

feat: refactor PDF byte conversion to utilize pdfium for improved per…
This commit is contained in:
Xiaomeng Zhao
2026-03-25 19:02:26 +08:00
committed by GitHub
3 changed files with 91 additions and 30 deletions

View File

@@ -1,12 +1,10 @@
# Copyright (c) Opendatalab. All rights reserved.
import io
import json
import os
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from loguru import logger
from pypdf import PdfReader, PdfWriter
from mineru.data.data_reader_writer import FileBasedDataWriter
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
@@ -19,7 +17,7 @@ from mineru.backend.office.office_middle_json_mkcontent import union_make as off
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
from mineru.backend.vlm.vlm_analyze import aio_doc_analyze as aio_vlm_doc_analyze
from mineru.backend.office.docx_analyze import office_docx_analyze
from mineru.utils.pdf_page_id import get_end_page_id
from mineru.utils.pdfium_guard import rewrite_pdf_bytes_with_pdfium
os.environ["TORCH_CUDNN_V8_API_DISABLED"] = "1"
if os.getenv("MINERU_LMDEPLOY_DEVICE", "") == "maca":
@@ -61,29 +59,20 @@ def prepare_env(output_dir, pdf_file_name, parse_method):
def convert_pdf_bytes_to_bytes(pdf_bytes, start_page_id=0, end_page_id=None):
    """Return a PDF containing only the requested page range.

    Delegates the rebuild to pdfium; any failure (or an empty rebuild
    result) falls back to returning the caller's original bytes.
    """
    try:
        rebuilt = rewrite_pdf_bytes_with_pdfium(
            pdf_bytes,
            start_page_id=start_page_id,
            end_page_id=end_page_id,
        )
        if rebuilt:
            return rebuilt
        # Empty output means pdfium produced nothing usable — keep the source bytes.
        logger.warning("PDFium rewrite returned empty bytes, using original PDF bytes.")
    except Exception as fallback_error:
        # Best-effort conversion: never let a rewrite failure break the pipeline.
        logger.warning(
            f"Error in converting PDF bytes with pdfium: {fallback_error}, "
            "using original PDF bytes."
        )
    return pdf_bytes
def _prepare_pdf_bytes(pdf_bytes_list, start_page_id, end_page_id):
@@ -125,10 +114,16 @@ def _process_output(
raise Exception(f"Unknown process_mode: {process_mode}")
"""处理输出文件"""
if f_draw_layout_bbox:
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")
try:
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")
except Exception as exc:
logger.warning(f"Skipping layout bbox visualization for {pdf_file_name}: {exc}")
if f_draw_span_bbox:
draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")
try:
draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")
except Exception as exc:
logger.warning(f"Skipping span bbox visualization for {pdf_file_name}: {exc}")
if f_dump_orig_pdf:
if process_mode in ["pipeline", "vlm"]:

View File

@@ -176,11 +176,28 @@ async def to_markdown(file_path, end_pages=10, is_ocr=False, formula_enable=True
if is_office:
new_pdf_path = None
else:
new_pdf_path = os.path.join(local_md_dir, file_name + '_layout.pdf')
new_pdf_path = resolve_preview_pdf_path(local_md_dir, file_name)
return md_content, txt_content, archive_zip_path, new_pdf_path
def resolve_preview_pdf_path(local_md_dir, file_name):
    """Pick the preview PDF for *file_name* inside *local_md_dir*.

    Prefers the layout-annotated PDF; falls back to the origin PDF with a
    warning, and returns None (with a warning) when neither file exists.
    """
    # Check candidates in preference order: layout first, then origin.
    for suffix in ('_layout.pdf', '_origin.pdf'):
        candidate = os.path.join(local_md_dir, file_name + suffix)
        if os.path.exists(candidate):
            if suffix == '_origin.pdf':
                logger.warning(
                    f"Layout preview PDF not found for {file_name}, "
                    f"falling back to origin PDF: {candidate}"
                )
            return candidate
    logger.warning(f"No preview PDF found for {file_name} under {local_md_dir}")
    return None
latex_delimiters_type_a = [
{'left': '$$', 'right': '$$', 'display': True},
{'left': '$', 'right': '$', 'display': False},

View File

@@ -1,6 +1,9 @@
import threading
from io import BytesIO
from contextlib import contextmanager
from typing import Any, Callable, TypeVar
from typing import Any, Callable, Sequence, TypeVar
from mineru.utils.pdf_page_id import get_end_page_id
_pdfium_lock = threading.RLock()
@@ -33,3 +36,49 @@ def close_pdfium_document(pdf_doc) -> None:
return
with pdfium_guard():
pdf_doc.close()
def rewrite_pdf_bytes_with_pdfium(
    src_pdf_bytes: bytes,
    start_page_id: int = 0,
    end_page_id: int | None = None,
    page_indices: Sequence[int] | None = None,
) -> bytes:
    """Rebuild a PDF from *src_pdf_bytes* keeping only the selected pages.

    Page selection: when *page_indices* is given, it wins (out-of-range
    entries are dropped, duplicates collapsed, order normalized ascending);
    otherwise the inclusive range ``start_page_id..end_page_id`` is used,
    with *end_page_id* normalized via :func:`get_end_page_id`.

    Returns the rebuilt PDF bytes, or ``b""`` when the source has no pages
    or the selection resolves to no valid pages. Raises whatever pdfium
    raises on malformed input — callers are expected to handle fallback.
    """
    import pypdfium2 as pdfium

    pdf_doc = None
    output_doc = None
    try:
        # pdfium is not thread-safe; serialize all access through the guard.
        with pdfium_guard():
            pdf_doc = pdfium.PdfDocument(src_pdf_bytes)
            total_page_count = len(pdf_doc)
            if total_page_count == 0:
                return b""
            if page_indices is not None:
                normalized_page_indices = sorted(
                    {
                        page_index
                        for page_index in page_indices
                        if 0 <= page_index < total_page_count
                    }
                )
            else:
                normalized_end_page_id = get_end_page_id(end_page_id, total_page_count)
                # Clamp a negative start so pdfium never sees an invalid index.
                normalized_page_indices = list(
                    range(max(start_page_id, 0), normalized_end_page_id + 1)
                )
            # Unified empty-selection handling for BOTH branches: the range
            # branch previously imported an empty selection and saved an
            # empty document instead of signalling "nothing selected".
            if not normalized_page_indices:
                return b""
            output_doc = pdfium.PdfDocument.new()
            output_doc.import_pages(pdf_doc, normalized_page_indices)
            output_buffer = BytesIO()
            output_doc.save(output_buffer)
            return output_buffer.getvalue()
    finally:
        # close_pdfium_document takes the guard itself and tolerates
        # partially-initialized documents.
        if output_doc is not None:
            close_pdfium_document(output_doc)
        if pdf_doc is not None:
            close_pdfium_document(pdf_doc)