Merge pull request #4651 from myhloli/dev

Dev
This commit is contained in:
Xiaomeng Zhao
2026-03-24 16:54:30 +08:00
committed by GitHub
6 changed files with 52 additions and 54 deletions

View File

@@ -7,7 +7,7 @@ from pathlib import Path
from loguru import logger
from mineru.cli.common import convert_pdf_bytes_to_bytes_by_pypdfium2, prepare_env, read_fn, pptx_suffixes, \
from mineru.cli.common import convert_pdf_bytes_to_bytes, prepare_env, read_fn, pptx_suffixes, \
xlsx_suffixes, pdf_suffixes, image_suffixes, office_suffixes, docx_suffixes
from mineru.data.data_reader_writer import FileBasedDataWriter
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
@@ -72,7 +72,7 @@ def do_parse(
if backend == "pipeline":
for idx, pdf_bytes in enumerate(pdf_bytes_list):
new_pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
new_pdf_bytes = convert_pdf_bytes_to_bytes(pdf_bytes, start_page_id, end_page_id)
pdf_bytes_list[idx] = new_pdf_bytes
image_writer_list = []
@@ -149,7 +149,7 @@ def do_parse(
parse_method = "vlm"
for idx, pdf_bytes in enumerate(pdf_bytes_list):
pdf_file_name = pdf_file_names[idx]
pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
pdf_bytes = convert_pdf_bytes_to_bytes(pdf_bytes, start_page_id, end_page_id)
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
if is_low_memory_enabled():
@@ -174,7 +174,7 @@ def do_parse(
parse_method = f"hybrid_{parse_method}"
for idx, pdf_bytes in enumerate(pdf_bytes_list):
pdf_file_name = pdf_file_names[idx]
pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
pdf_bytes = convert_pdf_bytes_to_bytes(pdf_bytes, start_page_id, end_page_id)
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
if is_low_memory_enabled():

View File

@@ -60,6 +60,8 @@ def __is_list_or_index_block(block):
if block['type'] == BlockType.VERTICAL_TEXT:
return BlockType.VERTICAL_TEXT
if block['type'] == BlockType.INDEX:
for line in block['lines']:
line[ListLineTag.IS_LIST_START_LINE] = True
return BlockType.INDEX
# 一个block如果是list block 应该同时满足以下特征
# 1.block内有多个line 2.block 内有多个line左侧顶格写 3.block内有多个line 右侧不顶格(狗牙状)

View File

@@ -2,13 +2,11 @@
import io
import json
import os
import copy
import threading
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from loguru import logger
import pypdfium2 as pdfium
from pypdf import PdfReader, PdfWriter
from mineru.data.data_reader_writer import FileBasedDataWriter
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
@@ -39,8 +37,6 @@ office_suffixes = docx_suffixes + pptx_suffixes + xlsx_suffixes
os.environ["TOKENIZERS_PARALLELISM"] = "false"
_pdf_rewrite_lock = threading.Lock()
def read_fn(path, file_suffix: str | None = None):
if not isinstance(path, Path):
path = Path(path)
@@ -64,54 +60,38 @@ def prepare_env(output_dir, pdf_file_name, parse_method):
return local_image_dir, local_md_dir
def convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id=0, end_page_id=None):
# pypdfium2 document import/save is not thread-safe across concurrent FastAPI tasks.
with _pdf_rewrite_lock:
pdf = None
output_pdf = None
try:
pdf = pdfium.PdfDocument(pdf_bytes)
page_count = len(pdf)
end_page_id = get_end_page_id(end_page_id, page_count)
def convert_pdf_bytes_to_bytes(pdf_bytes, start_page_id=0, end_page_id=None):
try:
pdf_stream = io.BytesIO(pdf_bytes)
pdf = PdfReader(pdf_stream, strict=False)
page_count = len(pdf.pages)
end_page_id = get_end_page_id(end_page_id, page_count)
# Avoid rewriting when the caller requests the whole document.
if start_page_id <= 0 and end_page_id >= page_count - 1:
return pdf_bytes
output_pdf = pdfium.PdfDocument.new()
# 逐页导入,失败则跳过
output_index = 0
for page_index in range(start_page_id, end_page_id + 1):
try:
output_pdf.import_pages(pdf, pages=[page_index])
output_index += 1
except Exception as page_error:
output_pdf.del_page(output_index)
logger.warning(f"Failed to import page {page_index}: {page_error}, skipping this page.")
continue
# 将新PDF保存到内存缓冲区
output_buffer = io.BytesIO()
output_pdf.save(output_buffer)
# 获取字节数据
return output_buffer.getvalue()
except Exception as e:
logger.warning(f"Error in converting PDF bytes: {e}, Using original PDF bytes.")
# Avoid rewriting when the caller requests the whole document.
if start_page_id <= 0 and end_page_id >= page_count - 1:
return pdf_bytes
finally:
if pdf is not None:
pdf.close()
if output_pdf is not None:
output_pdf.close()
output_pdf = PdfWriter()
for page_index in range(start_page_id, end_page_id + 1):
try:
output_pdf.add_page(pdf.pages[page_index])
except Exception as page_error:
logger.warning(f"Failed to import page {page_index}: {page_error}, skipping this page.")
continue
output_buffer = io.BytesIO()
output_pdf.write(output_buffer)
return output_buffer.getvalue()
except Exception as e:
logger.warning(f"Error in converting PDF bytes: {e}, Using original PDF bytes.")
return pdf_bytes
def _prepare_pdf_bytes(pdf_bytes_list, start_page_id, end_page_id):
"""准备处理PDF字节数据"""
result = []
for pdf_bytes in pdf_bytes_list:
new_pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
new_pdf_bytes = convert_pdf_bytes_to_bytes(pdf_bytes, start_page_id, end_page_id)
result.append(new_pdf_bytes)
return result

View File

@@ -353,7 +353,7 @@ def get_mfr_effective_batch_size(num_items: int, requested_batch_size: int) -> i
def get_mfr_min_dynamic_batch_size(requested_batch_size: int) -> int:
return max(1, requested_batch_size // 4)
return max(16, requested_batch_size // 4)
def finalize_mfr_batch_groups(

View File

@@ -1,9 +1,14 @@
# Copyright (c) Opendatalab. All rights reserved.
import os
from loguru import logger
from mineru.utils.check_sys_env import is_mac_os_version_supported, is_windows_environment, is_mac_environment, \
is_linux_environment
DISABLE_VLM_ACCELERATION_ENV = "MINERU_DISABLE_VLM_ACCELERATION"
_TRUE_VALUES = ("1", "true", "yes", "on")
def get_vlm_engine(inference_engine: str, is_async: bool = False) -> str:
"""
@@ -16,7 +21,13 @@ def get_vlm_engine(inference_engine: str, is_async: bool = False) -> str:
Returns:
最终选择的引擎名称
"""
if inference_engine == 'auto':
if _is_vlm_acceleration_disabled():
if inference_engine != 'transformers':
logger.info(
f"{DISABLE_VLM_ACCELERATION_ENV} is enabled, forcing VLM inference engine to transformers."
)
inference_engine = 'transformers'
elif inference_engine == 'auto':
# 根据操作系统自动选择引擎
if is_windows_environment():
inference_engine = _select_windows_engine()
@@ -33,6 +44,11 @@ def get_vlm_engine(inference_engine: str, is_async: bool = False) -> str:
return formatted_engine
def _is_vlm_acceleration_disabled() -> bool:
    """Report whether the environment asks for VLM acceleration to be turned off.

    Reads the environment variable named by ``DISABLE_VLM_ACCELERATION_ENV``
    (falling back to ``"0"`` when it is unset), trims surrounding whitespace,
    lowercases it, and checks the result against ``_TRUE_VALUES``.

    Returns:
        True when the normalized value is one of ``_TRUE_VALUES``, else False.
    """
    raw_flag = os.getenv(DISABLE_VLM_ACCELERATION_ENV, "0")
    # Normalize so that values such as " TRUE " or "On" are still recognized.
    normalized_flag = raw_flag.strip().lower()
    return normalized_flag in _TRUE_VALUES
def _select_windows_engine() -> str:
"""Windows 平台引擎选择"""
try:

View File

@@ -7,7 +7,7 @@ from loguru import logger
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
from mineru.cli.common import (
convert_pdf_bytes_to_bytes_by_pypdfium2,
convert_pdf_bytes_to_bytes,
prepare_env,
read_fn,
)
@@ -48,7 +48,7 @@ def test_pipeline_with_two_config():
pdf_bytes_list.append(pdf_bytes)
p_lang_list.append("en")
for idx, pdf_bytes in enumerate(pdf_bytes_list):
new_pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes)
new_pdf_bytes = convert_pdf_bytes_to_bytes(pdf_bytes)
pdf_bytes_list[idx] = new_pdf_bytes
# Get the pipeline analysis results, testing both the txt and ocr parse methods
@@ -122,7 +122,7 @@ def test_pipeline_with_two_config():
#
# for idx, pdf_bytes in enumerate(pdf_bytes_list):
# pdf_file_name = pdf_file_names[idx]
# pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes)
# pdf_bytes = convert_pdf_bytes_to_bytes(pdf_bytes)
# local_image_dir, local_md_dir = prepare_env(
# output_dir, pdf_file_name, parse_method="vlm"
# )