mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
@@ -7,7 +7,7 @@ from pathlib import Path
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from mineru.cli.common import convert_pdf_bytes_to_bytes_by_pypdfium2, prepare_env, read_fn, pptx_suffixes, \
|
||||
from mineru.cli.common import convert_pdf_bytes_to_bytes, prepare_env, read_fn, pptx_suffixes, \
|
||||
xlsx_suffixes, pdf_suffixes, image_suffixes, office_suffixes, docx_suffixes
|
||||
from mineru.data.data_reader_writer import FileBasedDataWriter
|
||||
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
|
||||
@@ -72,7 +72,7 @@ def do_parse(
|
||||
|
||||
if backend == "pipeline":
|
||||
for idx, pdf_bytes in enumerate(pdf_bytes_list):
|
||||
new_pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
|
||||
new_pdf_bytes = convert_pdf_bytes_to_bytes(pdf_bytes, start_page_id, end_page_id)
|
||||
pdf_bytes_list[idx] = new_pdf_bytes
|
||||
|
||||
image_writer_list = []
|
||||
@@ -149,7 +149,7 @@ def do_parse(
|
||||
parse_method = "vlm"
|
||||
for idx, pdf_bytes in enumerate(pdf_bytes_list):
|
||||
pdf_file_name = pdf_file_names[idx]
|
||||
pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
|
||||
pdf_bytes = convert_pdf_bytes_to_bytes(pdf_bytes, start_page_id, end_page_id)
|
||||
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
|
||||
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
|
||||
if is_low_memory_enabled():
|
||||
@@ -174,7 +174,7 @@ def do_parse(
|
||||
parse_method = f"hybrid_{parse_method}"
|
||||
for idx, pdf_bytes in enumerate(pdf_bytes_list):
|
||||
pdf_file_name = pdf_file_names[idx]
|
||||
pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
|
||||
pdf_bytes = convert_pdf_bytes_to_bytes(pdf_bytes, start_page_id, end_page_id)
|
||||
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
|
||||
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
|
||||
if is_low_memory_enabled():
|
||||
|
||||
@@ -60,6 +60,8 @@ def __is_list_or_index_block(block):
|
||||
if block['type'] == BlockType.VERTICAL_TEXT:
|
||||
return BlockType.VERTICAL_TEXT
|
||||
if block['type'] == BlockType.INDEX:
|
||||
for line in block['lines']:
|
||||
line[ListLineTag.IS_LIST_START_LINE] = True
|
||||
return BlockType.INDEX
|
||||
# 一个block如果是list block 应该同时满足以下特征
|
||||
# 1.block内有多个line 2.block 内有多个line左侧顶格写 3.block内有多个line 右侧不顶格(狗牙状)
|
||||
|
||||
@@ -2,13 +2,11 @@
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import copy
|
||||
import threading
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from pathlib import Path
|
||||
|
||||
from loguru import logger
|
||||
import pypdfium2 as pdfium
|
||||
from pypdf import PdfReader, PdfWriter
|
||||
|
||||
from mineru.data.data_reader_writer import FileBasedDataWriter
|
||||
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
|
||||
@@ -39,8 +37,6 @@ office_suffixes = docx_suffixes + pptx_suffixes + xlsx_suffixes
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
|
||||
_pdf_rewrite_lock = threading.Lock()
|
||||
|
||||
def read_fn(path, file_suffix: str | None = None):
|
||||
if not isinstance(path, Path):
|
||||
path = Path(path)
|
||||
@@ -64,54 +60,38 @@ def prepare_env(output_dir, pdf_file_name, parse_method):
|
||||
return local_image_dir, local_md_dir
|
||||
|
||||
|
||||
def convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id=0, end_page_id=None):
|
||||
# pypdfium2 document import/save is not thread-safe across concurrent FastAPI tasks.
|
||||
with _pdf_rewrite_lock:
|
||||
pdf = None
|
||||
output_pdf = None
|
||||
try:
|
||||
pdf = pdfium.PdfDocument(pdf_bytes)
|
||||
page_count = len(pdf)
|
||||
end_page_id = get_end_page_id(end_page_id, page_count)
|
||||
def convert_pdf_bytes_to_bytes(pdf_bytes, start_page_id=0, end_page_id=None):
|
||||
try:
|
||||
pdf_stream = io.BytesIO(pdf_bytes)
|
||||
pdf = PdfReader(pdf_stream, strict=False)
|
||||
page_count = len(pdf.pages)
|
||||
end_page_id = get_end_page_id(end_page_id, page_count)
|
||||
|
||||
# Avoid rewriting when the caller requests the whole document.
|
||||
if start_page_id <= 0 and end_page_id >= page_count - 1:
|
||||
return pdf_bytes
|
||||
|
||||
output_pdf = pdfium.PdfDocument.new()
|
||||
|
||||
# 逐页导入,失败则跳过
|
||||
output_index = 0
|
||||
for page_index in range(start_page_id, end_page_id + 1):
|
||||
try:
|
||||
output_pdf.import_pages(pdf, pages=[page_index])
|
||||
output_index += 1
|
||||
except Exception as page_error:
|
||||
output_pdf.del_page(output_index)
|
||||
logger.warning(f"Failed to import page {page_index}: {page_error}, skipping this page.")
|
||||
continue
|
||||
|
||||
# 将新PDF保存到内存缓冲区
|
||||
output_buffer = io.BytesIO()
|
||||
output_pdf.save(output_buffer)
|
||||
|
||||
# 获取字节数据
|
||||
return output_buffer.getvalue()
|
||||
except Exception as e:
|
||||
logger.warning(f"Error in converting PDF bytes: {e}, Using original PDF bytes.")
|
||||
# Avoid rewriting when the caller requests the whole document.
|
||||
if start_page_id <= 0 and end_page_id >= page_count - 1:
|
||||
return pdf_bytes
|
||||
finally:
|
||||
if pdf is not None:
|
||||
pdf.close()
|
||||
if output_pdf is not None:
|
||||
output_pdf.close()
|
||||
|
||||
output_pdf = PdfWriter()
|
||||
for page_index in range(start_page_id, end_page_id + 1):
|
||||
try:
|
||||
output_pdf.add_page(pdf.pages[page_index])
|
||||
except Exception as page_error:
|
||||
logger.warning(f"Failed to import page {page_index}: {page_error}, skipping this page.")
|
||||
continue
|
||||
|
||||
output_buffer = io.BytesIO()
|
||||
output_pdf.write(output_buffer)
|
||||
return output_buffer.getvalue()
|
||||
except Exception as e:
|
||||
logger.warning(f"Error in converting PDF bytes: {e}, Using original PDF bytes.")
|
||||
return pdf_bytes
|
||||
|
||||
|
||||
def _prepare_pdf_bytes(pdf_bytes_list, start_page_id, end_page_id):
|
||||
"""准备处理PDF字节数据"""
|
||||
result = []
|
||||
for pdf_bytes in pdf_bytes_list:
|
||||
new_pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
|
||||
new_pdf_bytes = convert_pdf_bytes_to_bytes(pdf_bytes, start_page_id, end_page_id)
|
||||
result.append(new_pdf_bytes)
|
||||
return result
|
||||
|
||||
|
||||
@@ -353,7 +353,7 @@ def get_mfr_effective_batch_size(num_items: int, requested_batch_size: int) -> i
|
||||
|
||||
|
||||
def get_mfr_min_dynamic_batch_size(requested_batch_size: int) -> int:
|
||||
return max(1, requested_batch_size // 4)
|
||||
return max(16, requested_batch_size // 4)
|
||||
|
||||
|
||||
def finalize_mfr_batch_groups(
|
||||
|
||||
@@ -1,9 +1,14 @@
|
||||
# Copyright (c) Opendatalab. All rights reserved.
|
||||
import os
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from mineru.utils.check_sys_env import is_mac_os_version_supported, is_windows_environment, is_mac_environment, \
|
||||
is_linux_environment
|
||||
|
||||
DISABLE_VLM_ACCELERATION_ENV = "MINERU_DISABLE_VLM_ACCELERATION"
|
||||
_TRUE_VALUES = ("1", "true", "yes", "on")
|
||||
|
||||
|
||||
def get_vlm_engine(inference_engine: str, is_async: bool = False) -> str:
|
||||
"""
|
||||
@@ -16,7 +21,13 @@ def get_vlm_engine(inference_engine: str, is_async: bool = False) -> str:
|
||||
Returns:
|
||||
最终选择的引擎名称
|
||||
"""
|
||||
if inference_engine == 'auto':
|
||||
if _is_vlm_acceleration_disabled():
|
||||
if inference_engine != 'transformers':
|
||||
logger.info(
|
||||
f"{DISABLE_VLM_ACCELERATION_ENV} is enabled, forcing VLM inference engine to transformers."
|
||||
)
|
||||
inference_engine = 'transformers'
|
||||
elif inference_engine == 'auto':
|
||||
# 根据操作系统自动选择引擎
|
||||
if is_windows_environment():
|
||||
inference_engine = _select_windows_engine()
|
||||
@@ -33,6 +44,11 @@ def get_vlm_engine(inference_engine: str, is_async: bool = False) -> str:
|
||||
return formatted_engine
|
||||
|
||||
|
||||
def _is_vlm_acceleration_disabled() -> bool:
|
||||
value = os.getenv(DISABLE_VLM_ACCELERATION_ENV, "0")
|
||||
return value.strip().lower() in _TRUE_VALUES
|
||||
|
||||
|
||||
def _select_windows_engine() -> str:
|
||||
"""Windows 平台引擎选择"""
|
||||
try:
|
||||
|
||||
@@ -7,7 +7,7 @@ from loguru import logger
|
||||
from bs4 import BeautifulSoup
|
||||
from fuzzywuzzy import fuzz
|
||||
from mineru.cli.common import (
|
||||
convert_pdf_bytes_to_bytes_by_pypdfium2,
|
||||
convert_pdf_bytes_to_bytes,
|
||||
prepare_env,
|
||||
read_fn,
|
||||
)
|
||||
@@ -48,7 +48,7 @@ def test_pipeline_with_two_config():
|
||||
pdf_bytes_list.append(pdf_bytes)
|
||||
p_lang_list.append("en")
|
||||
for idx, pdf_bytes in enumerate(pdf_bytes_list):
|
||||
new_pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes)
|
||||
new_pdf_bytes = convert_pdf_bytes_to_bytes(pdf_bytes)
|
||||
pdf_bytes_list[idx] = new_pdf_bytes
|
||||
|
||||
# 获取 pipline 分析结果, 分别测试 txt 和 ocr 两种解析方法的结果
|
||||
@@ -122,7 +122,7 @@ def test_pipeline_with_two_config():
|
||||
#
|
||||
# for idx, pdf_bytes in enumerate(pdf_bytes_list):
|
||||
# pdf_file_name = pdf_file_names[idx]
|
||||
# pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes)
|
||||
# pdf_bytes = convert_pdf_bytes_to_bytes(pdf_bytes)
|
||||
# local_image_dir, local_md_dir = prepare_env(
|
||||
# output_dir, pdf_file_name, parse_method="vlm"
|
||||
# )
|
||||
|
||||
Reference in New Issue
Block a user