mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 19:18:34 +07:00
Merge pull request #4660 from myhloli/dev
feat: refactor PDF byte conversion to utilize pdfium for improved per…
This commit is contained in:
@@ -1,12 +1,10 @@
|
||||
# Copyright (c) Opendatalab. All rights reserved.
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from pathlib import Path
|
||||
|
||||
from loguru import logger
|
||||
from pypdf import PdfReader, PdfWriter
|
||||
|
||||
from mineru.data.data_reader_writer import FileBasedDataWriter
|
||||
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
|
||||
@@ -19,7 +17,7 @@ from mineru.backend.office.office_middle_json_mkcontent import union_make as off
|
||||
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
|
||||
from mineru.backend.vlm.vlm_analyze import aio_doc_analyze as aio_vlm_doc_analyze
|
||||
from mineru.backend.office.docx_analyze import office_docx_analyze
|
||||
from mineru.utils.pdf_page_id import get_end_page_id
|
||||
from mineru.utils.pdfium_guard import rewrite_pdf_bytes_with_pdfium
|
||||
|
||||
os.environ["TORCH_CUDNN_V8_API_DISABLED"] = "1"
|
||||
if os.getenv("MINERU_LMDEPLOY_DEVICE", "") == "maca":
|
||||
@@ -61,29 +59,20 @@ def prepare_env(output_dir, pdf_file_name, parse_method):
|
||||
|
||||
def convert_pdf_bytes_to_bytes(pdf_bytes, start_page_id=0, end_page_id=None):
    """Return PDF bytes restricted to the requested page range.

    The page extraction is delegated to ``rewrite_pdf_bytes_with_pdfium``.
    This function is deliberately best-effort: it never raises, and hands
    back the untouched input whenever the rewrite cannot produce a result.

    Args:
        pdf_bytes: Raw bytes of the source PDF.
        start_page_id: Index of the first page to keep (forwarded as-is).
        end_page_id: Index of the last page to keep, or ``None``
            (forwarded as-is; interpretation is up to the pdfium helper).

    Returns:
        The rewritten PDF bytes, or the original ``pdf_bytes`` when the
        rewrite yields empty output or raises.
    """
    try:
        rebuilt_pdf_bytes = rewrite_pdf_bytes_with_pdfium(
            pdf_bytes,
            start_page_id=start_page_id,
            end_page_id=end_page_id,
        )
        if rebuilt_pdf_bytes:
            return rebuilt_pdf_bytes
        # Empty output is treated as a soft failure rather than a valid PDF.
        logger.warning("PDFium rewrite returned empty bytes, using original PDF bytes.")
    except Exception as fallback_error:
        # Broad catch on purpose: a failed rewrite must not abort the caller.
        logger.warning(
            f"Error in converting PDF bytes with pdfium: {fallback_error}, "
            "using original PDF bytes."
        )
    return pdf_bytes
|
||||
|
||||
|
||||
def _prepare_pdf_bytes(pdf_bytes_list, start_page_id, end_page_id):
|
||||
@@ -125,10 +114,16 @@ def _process_output(
|
||||
raise Exception(f"Unknown process_mode: {process_mode}")
|
||||
"""处理输出文件"""
|
||||
if f_draw_layout_bbox:
|
||||
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")
|
||||
try:
|
||||
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")
|
||||
except Exception as exc:
|
||||
logger.warning(f"Skipping layout bbox visualization for {pdf_file_name}: {exc}")
|
||||
|
||||
if f_draw_span_bbox:
|
||||
draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")
|
||||
try:
|
||||
draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")
|
||||
except Exception as exc:
|
||||
logger.warning(f"Skipping span bbox visualization for {pdf_file_name}: {exc}")
|
||||
|
||||
if f_dump_orig_pdf:
|
||||
if process_mode in ["pipeline", "vlm"]:
|
||||
|
||||
@@ -176,11 +176,28 @@ async def to_markdown(file_path, end_pages=10, is_ocr=False, formula_enable=True
|
||||
if is_office:
|
||||
new_pdf_path = None
|
||||
else:
|
||||
new_pdf_path = os.path.join(local_md_dir, file_name + '_layout.pdf')
|
||||
new_pdf_path = resolve_preview_pdf_path(local_md_dir, file_name)
|
||||
|
||||
return md_content, txt_content, archive_zip_path, new_pdf_path
|
||||
|
||||
|
||||
def resolve_preview_pdf_path(local_md_dir, file_name):
    """Pick the PDF to show as a preview for ``file_name``.

    Looks in ``local_md_dir`` for ``<file_name>_layout.pdf`` first and
    ``<file_name>_origin.pdf`` second.

    Args:
        local_md_dir: Directory that holds the generated output files.
        file_name: Base name (without suffix) of the processed document.

    Returns:
        The path of the first candidate that exists, or ``None`` when
        neither file is present. A warning is logged whenever the layout
        PDF is missing.
    """
    preview_path = os.path.join(local_md_dir, file_name + '_layout.pdf')
    if not os.path.exists(preview_path):
        # Layout rendering is absent; try the dumped original instead.
        preview_path = os.path.join(local_md_dir, file_name + '_origin.pdf')
        if not os.path.exists(preview_path):
            logger.warning(f"No preview PDF found for {file_name} under {local_md_dir}")
            return None
        logger.warning(
            f"Layout preview PDF not found for {file_name}, "
            f"falling back to origin PDF: {preview_path}"
        )
    return preview_path
|
||||
|
||||
|
||||
latex_delimiters_type_a = [
|
||||
{'left': '$$', 'right': '$$', 'display': True},
|
||||
{'left': '$', 'right': '$', 'display': False},
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
import threading
|
||||
from io import BytesIO
|
||||
from contextlib import contextmanager
|
||||
from typing import Any, Callable, TypeVar
|
||||
from typing import Any, Callable, Sequence, TypeVar
|
||||
|
||||
from mineru.utils.pdf_page_id import get_end_page_id
|
||||
|
||||
|
||||
_pdfium_lock = threading.RLock()
|
||||
@@ -33,3 +36,49 @@ def close_pdfium_document(pdf_doc) -> None:
|
||||
return
|
||||
with pdfium_guard():
|
||||
pdf_doc.close()
|
||||
|
||||
|
||||
def rewrite_pdf_bytes_with_pdfium(
    src_pdf_bytes: bytes,
    start_page_id: int = 0,
    end_page_id: int | None = None,
    page_indices: Sequence[int] | None = None,
) -> bytes:
    """Rebuild a PDF containing only selected pages, using pypdfium2.

    The selection is either an explicit ``page_indices`` collection or the
    inclusive range ``start_page_id`` .. ``end_page_id``. When
    ``page_indices`` is given, the range arguments are ignored.

    Args:
        src_pdf_bytes: Raw bytes of the source PDF.
        start_page_id: First page index of the range; values below 0 are
            treated as 0.
        end_page_id: Last page index of the range. It is normalised through
            ``get_end_page_id`` (presumably ``None`` means "last page" --
            confirm against that helper).
        page_indices: Explicit page indices to keep. Out-of-range entries
            are dropped, duplicates are removed and the result is sorted.

    Returns:
        The bytes of the newly saved PDF, or ``b""`` when the source has no
        pages or the selection is empty.

    Raises:
        Whatever pypdfium2 raises while opening, importing or saving; the
        documents opened here are closed in all cases.
    """
    import pypdfium2 as pdfium

    pdf_doc = None
    output_doc = None
    try:
        # Every pdfium call below is made while holding the shared guard.
        with pdfium_guard():
            pdf_doc = pdfium.PdfDocument(src_pdf_bytes)
            total_page_count = len(pdf_doc)
            if total_page_count == 0:
                return b""

            if page_indices is not None:
                normalized_page_indices = sorted(
                    {
                        page_index
                        for page_index in page_indices
                        if 0 <= page_index < total_page_count
                    }
                )
            else:
                normalized_end_page_id = get_end_page_id(end_page_id, total_page_count)
                # Clamp both ends so the range branch obeys the same bounds
                # as the explicit-indices branch above.
                first_page_index = max(start_page_id, 0)
                last_page_index = min(normalized_end_page_id, total_page_count - 1)
                normalized_page_indices = list(
                    range(first_page_index, last_page_index + 1)
                )

            # An empty selection must never reach import_pages: pypdfium2
            # documents a missing page list as "import all pages", and a
            # falsy list appears to be handled the same way (verify against
            # the installed pypdfium2 version).
            if not normalized_page_indices:
                return b""

            output_doc = pdfium.PdfDocument.new()
            output_doc.import_pages(pdf_doc, normalized_page_indices)

            output_buffer = BytesIO()
            output_doc.save(output_buffer)
            return output_buffer.getvalue()
    finally:
        # Close the destination before the source it imported pages from.
        if output_doc is not None:
            close_pdfium_document(output_doc)
        if pdf_doc is not None:
            close_pdfium_document(pdf_doc)
|
||||
|
||||
Reference in New Issue
Block a user