mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
feat: simplify PDF byte conversion by integrating page range handling directly in rewrite function
This commit is contained in:
@@ -17,13 +17,7 @@ from mineru.backend.office.office_middle_json_mkcontent import union_make as off
|
||||
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
|
||||
from mineru.backend.vlm.vlm_analyze import aio_doc_analyze as aio_vlm_doc_analyze
|
||||
from mineru.backend.office.docx_analyze import office_docx_analyze
|
||||
from mineru.utils.pdf_page_id import get_end_page_id
|
||||
from mineru.utils.pdfium_guard import (
|
||||
close_pdfium_document,
|
||||
get_pdfium_document_page_count,
|
||||
open_pdfium_document,
|
||||
rewrite_pdf_bytes_with_pdfium,
|
||||
)
|
||||
from mineru.utils.pdfium_guard import rewrite_pdf_bytes_with_pdfium
|
||||
|
||||
os.environ["TORCH_CUDNN_V8_API_DISABLED"] = "1"
|
||||
if os.getenv("MINERU_LMDEPLOY_DEVICE", "") == "maca":
|
||||
@@ -63,38 +57,19 @@ def prepare_env(output_dir, pdf_file_name, parse_method):
|
||||
return local_image_dir, local_md_dir
|
||||
|
||||
|
||||
def _build_requested_page_indices(start_page_id, end_page_id, page_count):
    """Turn a requested page range into an explicit list of page indices.

    Args:
        start_page_id: Index of the first requested page.
        end_page_id: Requested last page; normalized against ``page_count``
            by ``get_end_page_id`` before use.
        page_count: Total number of pages in the document.

    Returns:
        ``[]`` when ``page_count`` is not positive; ``None`` when the
        request spans the whole document (start at or before 0 and the
        normalized end at or past the last index); otherwise the indices
        from ``start_page_id`` through the normalized end, inclusive.
    """
    if page_count <= 0:
        return []

    last_requested = get_end_page_id(end_page_id, page_count)
    covers_whole_document = (
        start_page_id <= 0 and last_requested >= page_count - 1
    )
    if covers_whole_document:
        # None (rather than a full index list) signals "no page filtering".
        return None

    return [*range(start_page_id, last_requested + 1)]
|
||||
|
||||
|
||||
def _get_pdfium_page_count(pdf_bytes):
    """Return the page count of the PDF held in ``pdf_bytes``.

    The document is opened through ``open_pdfium_document`` and is always
    handed to ``close_pdfium_document`` afterwards, even when opening or
    counting raises.
    """
    import pypdfium2 as pdfium

    document = None
    try:
        document = open_pdfium_document(pdfium.PdfDocument, pdf_bytes)
        page_total = get_pdfium_document_page_count(document)
    finally:
        # `document` is still None here if opening failed.
        close_pdfium_document(document)
    return page_total
|
||||
|
||||
|
||||
def convert_pdf_bytes_to_bytes(pdf_bytes, start_page_id=0, end_page_id=None):
    """Rewrite PDF bytes through pdfium, limited to the requested page range.

    The page range is forwarded unchanged to
    ``rewrite_pdf_bytes_with_pdfium``, which normalizes ``end_page_id``
    itself, so no separate page-count pass is needed here.

    Args:
        pdf_bytes: Raw bytes of the source PDF.
        start_page_id: Index of the first page to keep.
        end_page_id: Index of the last page to keep (inclusive); ``None`` is
            passed through and resolved by the rewrite function.

    Returns:
        The rewritten PDF bytes, or the original ``pdf_bytes`` unchanged when
        the rewrite raises or produces an empty result.
    """
    try:
        rebuilt_pdf_bytes = rewrite_pdf_bytes_with_pdfium(
            pdf_bytes,
            start_page_id=start_page_id,
            end_page_id=end_page_id,
        )
    except Exception as rewrite_error:
        # Deliberate best-effort: any rewrite failure falls back to the
        # caller's original bytes instead of aborting the pipeline.
        logger.warning(
            f"Error in converting PDF bytes with pdfium: {rewrite_error}, "
            "using original PDF bytes."
        )
        return pdf_bytes

    if rebuilt_pdf_bytes:
        return rebuilt_pdf_bytes

    logger.warning("PDFium rewrite returned empty bytes, using original PDF bytes.")
    return pdf_bytes
|
||||
|
||||
@@ -3,6 +3,8 @@ from io import BytesIO
|
||||
from contextlib import contextmanager
|
||||
from typing import Any, Callable, Sequence, TypeVar
|
||||
|
||||
from mineru.utils.pdf_page_id import get_end_page_id
|
||||
|
||||
|
||||
_pdfium_lock = threading.RLock()
|
||||
|
||||
@@ -38,6 +40,8 @@ def close_pdfium_document(pdf_doc) -> None:
|
||||
|
||||
def rewrite_pdf_bytes_with_pdfium(
|
||||
src_pdf_bytes: bytes,
|
||||
start_page_id: int = 0,
|
||||
end_page_id: int | None = None,
|
||||
page_indices: Sequence[int] | None = None,
|
||||
) -> bytes:
|
||||
import pypdfium2 as pdfium
|
||||
@@ -51,9 +55,7 @@ def rewrite_pdf_bytes_with_pdfium(
|
||||
if total_page_count == 0:
|
||||
return b""
|
||||
|
||||
if page_indices is None:
|
||||
normalized_page_indices = list(range(total_page_count))
|
||||
else:
|
||||
if page_indices is not None:
|
||||
normalized_page_indices = sorted(
|
||||
{
|
||||
page_index
|
||||
@@ -63,6 +65,11 @@ def rewrite_pdf_bytes_with_pdfium(
|
||||
)
|
||||
if not normalized_page_indices:
|
||||
return b""
|
||||
else:
|
||||
normalized_end_page_id = get_end_page_id(end_page_id, total_page_count)
|
||||
normalized_page_indices = list(
|
||||
range(start_page_id, normalized_end_page_id + 1)
|
||||
)
|
||||
|
||||
output_doc = pdfium.PdfDocument.new()
|
||||
output_doc.import_pages(pdf_doc, normalized_page_indices)
|
||||
|
||||
Reference in New Issue
Block a user