# MinerU/mineru/cli/common.py
# Copyright (c) Opendatalab. All rights reserved.
import json
import os
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Sequence

from loguru import logger

from mineru.data.data_reader_writer import FileBasedDataWriter
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
from mineru.utils.engine_utils import get_vlm_engine
from mineru.utils.enum_class import MakeMode
from mineru.utils.guess_suffix_or_lang import guess_suffix_by_bytes
from mineru.utils.pdf_image_tools import images_bytes_to_pdf_bytes
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
from mineru.backend.office.office_middle_json_mkcontent import union_make as office_union_make
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
from mineru.backend.vlm.vlm_analyze import aio_doc_analyze as aio_vlm_doc_analyze
from mineru.backend.office.docx_analyze import office_docx_analyze
from mineru.utils.pdfium_guard import rewrite_pdf_bytes_with_pdfium

os.environ["TORCH_CUDNN_V8_API_DISABLED"] = "1"
if os.getenv("MINERU_LMDEPLOY_DEVICE", "") == "maca":
    import torch

    torch.backends.cudnn.enabled = False

pdf_suffixes = ["pdf"]
image_suffixes = ["png", "jpeg", "jp2", "webp", "gif", "bmp", "jpg", "tiff"]
docx_suffixes = ["docx"]
pptx_suffixes = ["pptx"]
xlsx_suffixes = ["xlsx"]
office_suffixes = docx_suffixes + pptx_suffixes + xlsx_suffixes

os.environ["TOKENIZERS_PARALLELISM"] = "false"


def uniquify_task_stems(
    stems: Sequence[str],
) -> tuple[list[str], list[tuple[str, str]]]:
    """Assign task-local unique stems while preserving input order."""
    raw_keys = {stem.casefold() for stem in stems}
    occurrence_counts: dict[str, int] = {}
    assigned_keys: set[str] = set()
    unique_stems: list[str] = []
    renamed: list[tuple[str, str]] = []
    for stem in stems:
        stem_key = stem.casefold()
        seen_count = occurrence_counts.get(stem_key, 0)
        occurrence_counts[stem_key] = seen_count + 1
        if seen_count == 0 and stem_key not in assigned_keys:
            effective_stem = stem
        else:
            suffix = seen_count + 1
            while True:
                candidate = f"{stem}_{suffix}"
                candidate_key = candidate.casefold()
                if candidate_key not in raw_keys and candidate_key not in assigned_keys:
                    effective_stem = candidate
                    break
                suffix += 1
        assigned_keys.add(effective_stem.casefold())
        unique_stems.append(effective_stem)
        if effective_stem != stem:
            renamed.append((stem, effective_stem))
    return unique_stems, renamed
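# Illustrative trace (not part of the original module): collisions are
# resolved case-insensitively, so the first occurrence keeps its name and
# later duplicates get numeric suffixes:
#   uniquify_task_stems(["report", "Report", "report"])
#   -> (["report", "Report_2", "report_3"],
#       [("Report", "Report_2"), ("report", "report_3")])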


def read_fn(path, file_suffix: str | None = None):
    if not isinstance(path, Path):
        path = Path(path)
    with open(str(path), "rb") as input_file:
        file_bytes = input_file.read()
    if file_suffix is None:
        file_suffix = guess_suffix_by_bytes(file_bytes, path)
    if file_suffix in image_suffixes:
        return images_bytes_to_pdf_bytes(file_bytes)
    elif file_suffix in pdf_suffixes + office_suffixes:
        return file_bytes
    else:
        raise Exception(f"Unknown file suffix: {file_suffix}")


def prepare_env(output_dir, pdf_file_name, parse_method):
    local_md_dir = str(os.path.join(output_dir, pdf_file_name, parse_method))
    local_image_dir = os.path.join(str(local_md_dir), "images")
    os.makedirs(local_image_dir, exist_ok=True)
    os.makedirs(local_md_dir, exist_ok=True)
    return local_image_dir, local_md_dir
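# Resulting directory layout (derived from the code above):
#   <output_dir>/<pdf_file_name>/<parse_method>/         <- markdown/JSON outputs
#   <output_dir>/<pdf_file_name>/<parse_method>/images/  <- extracted images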


def convert_pdf_bytes_to_bytes(pdf_bytes, start_page_id=0, end_page_id=None):
    try:
        rebuilt_pdf_bytes = rewrite_pdf_bytes_with_pdfium(
            pdf_bytes,
            start_page_id=start_page_id,
            end_page_id=end_page_id,
        )
        if rebuilt_pdf_bytes:
            return rebuilt_pdf_bytes
        logger.warning("PDFium rewrite returned empty bytes, using original PDF bytes.")
    except Exception as fallback_error:
        logger.warning(
            f"Error in converting PDF bytes with pdfium: {fallback_error}, "
            "using original PDF bytes."
        )
    return pdf_bytes
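# Note: the pdfium rewrite both applies the requested page range and rebuilds
# the document, presumably to sanitize PDFs that downstream parsers struggle
# with. On failure or empty output the original bytes are returned unchanged,
# which means the page range is NOT applied in the fallback case.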


def _prepare_pdf_bytes(pdf_bytes_list, start_page_id, end_page_id):
    """Prepare the PDF bytes for processing."""
    result = []
    for pdf_bytes in pdf_bytes_list:
        new_pdf_bytes = convert_pdf_bytes_to_bytes(pdf_bytes, start_page_id, end_page_id)
        result.append(new_pdf_bytes)
    return result


def _process_output(
    pdf_info,
    pdf_bytes,
    pdf_file_name,
    local_md_dir,
    local_image_dir,
    md_writer,
    f_draw_layout_bbox,
    f_draw_span_bbox,
    f_dump_orig_pdf,
    f_dump_md,
    f_dump_content_list,
    f_dump_middle_json,
    f_dump_model_output,
    f_make_md_mode,
    middle_json,
    model_output=None,
    process_mode="vlm",
):
    """Write the output files for a single document."""
    from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make

    if process_mode == "pipeline":
        make_func = pipeline_union_make
    elif process_mode == "vlm":
        make_func = vlm_union_make
    elif process_mode in office_suffixes:
        make_func = office_union_make
    else:
        raise Exception(f"Unknown process_mode: {process_mode}")

    if f_draw_layout_bbox:
        try:
            draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")
        except Exception as exc:
            logger.warning(f"Skipping layout bbox visualization for {pdf_file_name}: {exc}")

    if f_draw_span_bbox:
        try:
            draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")
        except Exception as exc:
            logger.warning(f"Skipping span bbox visualization for {pdf_file_name}: {exc}")

    if f_dump_orig_pdf:
        if process_mode in ["pipeline", "vlm"]:
            md_writer.write(
                f"{pdf_file_name}_origin.pdf",
                pdf_bytes,
            )
        elif process_mode in office_suffixes:
            md_writer.write(
                f"{pdf_file_name}_origin.{process_mode}",
                pdf_bytes,
            )

    image_dir = str(os.path.basename(local_image_dir))

    if f_dump_md:
        md_content_str = make_func(pdf_info, f_make_md_mode, image_dir)
        md_writer.write_string(
            f"{pdf_file_name}.md",
            md_content_str,
        )

    if f_dump_content_list:
        content_list = make_func(pdf_info, MakeMode.CONTENT_LIST, image_dir)
        md_writer.write_string(
            f"{pdf_file_name}_content_list.json",
            json.dumps(content_list, ensure_ascii=False, indent=4),
        )
        content_list_v2 = make_func(pdf_info, MakeMode.CONTENT_LIST_V2, image_dir)
        md_writer.write_string(
            f"{pdf_file_name}_content_list_v2.json",
            json.dumps(content_list_v2, ensure_ascii=False, indent=4),
        )

    if f_dump_middle_json:
        md_writer.write_string(
            f"{pdf_file_name}_middle.json",
            json.dumps(middle_json, ensure_ascii=False, indent=4),
        )

    if f_dump_model_output:
        md_writer.write_string(
            f"{pdf_file_name}_model.json",
            json.dumps(model_output, ensure_ascii=False, indent=4),
        )

    logger.debug(f"local output dir is {local_md_dir}")


def _process_pipeline(
    output_dir,
    pdf_file_names,
    pdf_bytes_list,
    p_lang_list,
    parse_method,
    p_formula_enable,
    p_table_enable,
    f_draw_layout_bbox,
    f_draw_span_bbox,
    f_dump_md,
    f_dump_middle_json,
    f_dump_model_output,
    f_dump_orig_pdf,
    f_dump_content_list,
    f_make_md_mode,
):
    """Drive the pipeline backend."""
    from mineru.backend.pipeline.pipeline_analyze import doc_analyze_streaming as pipeline_doc_analyze_streaming

    image_writer_list = []
    md_writer_list = []
    local_output_info = []
    for idx, pdf_bytes in enumerate(pdf_bytes_list):
        pdf_file_name = pdf_file_names[idx]
        local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
        image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
        image_writer_list.append(image_writer)
        md_writer_list.append(md_writer)
        local_output_info.append((pdf_file_name, local_image_dir, local_md_dir))

    output_futures = []

    def run_output_task(doc_index, middle_json, model_list):
        pdf_file_name, local_image_dir, local_md_dir = local_output_info[doc_index]
        md_writer = md_writer_list[doc_index]
        pdf_bytes = pdf_bytes_list[doc_index]
        logger.debug(f"Pipeline output start: doc{doc_index}")
        try:
            _process_output(
                middle_json["pdf_info"], pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
                md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
                f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
                f_make_md_mode, middle_json, model_list, process_mode="pipeline"
            )
            logger.debug(f"Pipeline output complete: doc{doc_index}")
        except Exception:
            logger.exception(f"Pipeline output failed: doc{doc_index}")
            raise
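    # A single-worker executor serializes output writing: analysis can stream
    # ahead to the next document while the previous document's outputs are
    # still being written, but the output tasks themselves never interleave.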
    with ThreadPoolExecutor(max_workers=1) as output_executor:

        def on_doc_ready(doc_index, model_list, middle_json, ocr_enable):
            logger.debug(
                f"Pipeline doc ready: doc{doc_index} pages={len(middle_json['pdf_info'])} output_submitted=1"
            )
            future = output_executor.submit(run_output_task, doc_index, middle_json, model_list)
            output_futures.append(future)

        pipeline_doc_analyze_streaming(
            pdf_bytes_list,
            image_writer_list,
            p_lang_list,
            on_doc_ready,
            parse_method=parse_method,
            formula_enable=p_formula_enable,
            table_enable=p_table_enable,
        )

    for future in output_futures:
        future.result()


async def _async_process_vlm(
    output_dir,
    pdf_file_names,
    pdf_bytes_list,
    backend,
    f_draw_layout_bbox,
    f_draw_span_bbox,
    f_dump_md,
    f_dump_middle_json,
    f_dump_model_output,
    f_dump_orig_pdf,
    f_dump_content_list,
    f_make_md_mode,
    server_url=None,
    **kwargs,
):
    """Drive the VLM backend asynchronously."""
    parse_method = "vlm"
    f_draw_span_bbox = False
    if not backend.endswith("client"):
        server_url = None
    for idx, pdf_bytes in enumerate(pdf_bytes_list):
        pdf_file_name = pdf_file_names[idx]
        local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
        image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
        middle_json, infer_result = await aio_vlm_doc_analyze(
            pdf_bytes, image_writer=image_writer, backend=backend, server_url=server_url, **kwargs,
        )
        pdf_info = middle_json["pdf_info"]
        _process_output(
            pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
            md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
            f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
            f_make_md_mode, middle_json, infer_result, process_mode="vlm"
        )


def _process_vlm(
    output_dir,
    pdf_file_names,
    pdf_bytes_list,
    backend,
    f_draw_layout_bbox,
    f_draw_span_bbox,
    f_dump_md,
    f_dump_middle_json,
    f_dump_model_output,
    f_dump_orig_pdf,
    f_dump_content_list,
    f_make_md_mode,
    server_url=None,
    **kwargs,
):
    """Drive the VLM backend synchronously."""
    parse_method = "vlm"
    f_draw_span_bbox = False
    if not backend.endswith("client"):
        server_url = None
    for idx, pdf_bytes in enumerate(pdf_bytes_list):
        pdf_file_name = pdf_file_names[idx]
        local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
        image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
        middle_json, infer_result = vlm_doc_analyze(
            pdf_bytes, image_writer=image_writer, backend=backend, server_url=server_url, **kwargs,
        )
        pdf_info = middle_json["pdf_info"]
        _process_output(
            pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
            md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
            f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
            f_make_md_mode, middle_json, infer_result, process_mode="vlm"
        )
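# _process_vlm mirrors _async_process_vlm exactly except that the analysis
# call is synchronous; in both paths span drawing is force-disabled and
# server_url is only honored for *-client backends.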


def _process_hybrid(
    output_dir,
    pdf_file_names,
    pdf_bytes_list,
    h_lang_list,
    parse_method,
    inline_formula_enable,
    backend,
    f_draw_layout_bbox,
    f_draw_span_bbox,
    f_dump_md,
    f_dump_middle_json,
    f_dump_model_output,
    f_dump_orig_pdf,
    f_dump_content_list,
    f_make_md_mode,
    server_url=None,
    **kwargs,
):
    """Drive the hybrid backend synchronously."""
    from mineru.backend.hybrid.hybrid_analyze import doc_analyze as hybrid_doc_analyze

    if not backend.endswith("client"):
        server_url = None
    for idx, (pdf_bytes, lang) in enumerate(zip(pdf_bytes_list, h_lang_list)):
        pdf_file_name = pdf_file_names[idx]
        local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, f"hybrid_{parse_method}")
        image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
        middle_json, infer_result, _vlm_ocr_enable = hybrid_doc_analyze(
            pdf_bytes,
            image_writer=image_writer,
            backend=backend,
            parse_method=parse_method,
            language=lang,
            inline_formula_enable=inline_formula_enable,
            server_url=server_url,
            **kwargs,
        )
        pdf_info = middle_json["pdf_info"]
        # f_draw_span_bbox = not _vlm_ocr_enable
        f_draw_span_bbox = False
        _process_output(
            pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
            md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
            f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
            f_make_md_mode, middle_json, infer_result, process_mode="vlm"
        )


async def _async_process_hybrid(
    output_dir,
    pdf_file_names,
    pdf_bytes_list,
    h_lang_list,
    parse_method,
    inline_formula_enable,
    backend,
    f_draw_layout_bbox,
    f_draw_span_bbox,
    f_dump_md,
    f_dump_middle_json,
    f_dump_model_output,
    f_dump_orig_pdf,
    f_dump_content_list,
    f_make_md_mode,
    server_url=None,
    **kwargs,
):
    """Drive the hybrid backend asynchronously."""
    from mineru.backend.hybrid.hybrid_analyze import aio_doc_analyze as aio_hybrid_doc_analyze

    if not backend.endswith("client"):
        server_url = None
    for idx, (pdf_bytes, lang) in enumerate(zip(pdf_bytes_list, h_lang_list)):
        pdf_file_name = pdf_file_names[idx]
        local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, f"hybrid_{parse_method}")
        image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
        middle_json, infer_result, _vlm_ocr_enable = await aio_hybrid_doc_analyze(
            pdf_bytes,
            image_writer=image_writer,
            backend=backend,
            parse_method=parse_method,
            language=lang,
            inline_formula_enable=inline_formula_enable,
            server_url=server_url,
            **kwargs,
        )
        pdf_info = middle_json["pdf_info"]
        # f_draw_span_bbox = not _vlm_ocr_enable
        f_draw_span_bbox = False
        _process_output(
            pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
            md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
            f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
            f_make_md_mode, middle_json, infer_result, process_mode="vlm"
        )
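# In both hybrid variants span drawing is currently hard-disabled; the
# commented-out assignments show it was previously tied to whether VLM OCR
# was enabled for the document.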


def _process_office_doc(
    output_dir,
    pdf_file_names: list[str],
    pdf_bytes_list: list[bytes],
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_output=True,
    f_dump_orig_file=True,
    f_dump_content_list=True,
    f_make_md_mode=MakeMode.MM_MD,
):
    need_remove_index = []
    for i, file_bytes in enumerate(pdf_bytes_list):
        pdf_file_name = pdf_file_names[i]
        file_suffix = guess_suffix_by_bytes(file_bytes)
        if file_suffix in docx_suffixes:
            need_remove_index.append(i)
            local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, "office")
            image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
            middle_json, infer_result = office_docx_analyze(
                file_bytes,
                image_writer=image_writer,
            )
            f_draw_layout_bbox = False
            f_draw_span_bbox = False
            pdf_info = middle_json["pdf_info"]
            _process_output(
                pdf_info, file_bytes, pdf_file_name, local_md_dir, local_image_dir,
                md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_file,
                f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
                f_make_md_mode, middle_json, infer_result, process_mode="docx"
            )
        elif file_suffix in pptx_suffixes:
            need_remove_index.append(i)
            logger.warning(f"Currently, PPTX files are not supported: {pdf_file_name}")
        elif file_suffix in xlsx_suffixes:
            need_remove_index.append(i)
            logger.warning(f"Currently, XLSX files are not supported: {pdf_file_name}")
    return need_remove_index
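# The returned indices cover every office document (handled .docx as well as
# unsupported .pptx/.xlsx); do_parse and aio_do_parse delete these entries
# from their input lists so only PDFs and images reach the PDF backends.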


def do_parse(
    output_dir,
    pdf_file_names: list[str],
    pdf_bytes_list: list[bytes],
    p_lang_list: list[str],
    backend="pipeline",
    parse_method="auto",
    formula_enable=True,
    table_enable=True,
    server_url=None,
    f_draw_layout_bbox=True,
    f_draw_span_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_output=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=True,
    f_make_md_mode=MakeMode.MM_MD,
    start_page_id=0,
    end_page_id=None,
    **kwargs,
):
    need_remove_index = _process_office_doc(
        output_dir,
        pdf_file_names=pdf_file_names,
        pdf_bytes_list=pdf_bytes_list,
        f_dump_md=f_dump_md,
        f_dump_middle_json=f_dump_middle_json,
        f_dump_model_output=f_dump_model_output,
        f_dump_orig_file=f_dump_orig_pdf,
        f_dump_content_list=f_dump_content_list,
        f_make_md_mode=f_make_md_mode,
    )
    for index in sorted(need_remove_index, reverse=True):
        del pdf_bytes_list[index]
        del pdf_file_names[index]
        del p_lang_list[index]
    if not pdf_bytes_list:
        logger.warning("No valid PDF or image files to process.")
        return

    # Pre-process the PDF bytes (apply page range, rewrite through pdfium)
    pdf_bytes_list = _prepare_pdf_bytes(pdf_bytes_list, start_page_id, end_page_id)

    if backend == "pipeline":
        _process_pipeline(
            output_dir, pdf_file_names, pdf_bytes_list, p_lang_list,
            parse_method, formula_enable, table_enable,
            f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
            f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode
        )
    else:
        if backend.startswith("vlm-"):
            backend = backend[4:]
            if backend == "vllm-async-engine":
                raise Exception("vlm-vllm-async-engine backend is not supported in sync mode, please use vlm-vllm-engine backend")
            if backend == "auto-engine":
                backend = get_vlm_engine(inference_engine='auto', is_async=False)
            os.environ['MINERU_VLM_FORMULA_ENABLE'] = str(formula_enable)
            os.environ['MINERU_VLM_TABLE_ENABLE'] = str(table_enable)
            _process_vlm(
                output_dir, pdf_file_names, pdf_bytes_list, backend,
                f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
                f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode,
                server_url, **kwargs,
            )
        elif backend.startswith("hybrid-"):
            backend = backend[7:]
            if backend == "vllm-async-engine":
                raise Exception(
                    "hybrid-vllm-async-engine backend is not supported in sync mode, please use hybrid-vllm-engine backend")
            if backend == "auto-engine":
                backend = get_vlm_engine(inference_engine='auto', is_async=False)
            os.environ['MINERU_VLM_TABLE_ENABLE'] = str(table_enable)
            os.environ['MINERU_VLM_FORMULA_ENABLE'] = "true"
            _process_hybrid(
                output_dir, pdf_file_names, pdf_bytes_list, p_lang_list, parse_method, formula_enable, backend,
                f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
                f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode,
                server_url, **kwargs,
            )
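# Minimal synchronous usage sketch (paths are illustrative, not part of the
# original module):
#   from pathlib import Path
#   pdf = Path("demo.pdf")
#   do_parse("./output", [pdf.stem], [read_fn(pdf)], ["ch"], backend="pipeline")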


async def aio_do_parse(
    output_dir,
    pdf_file_names: list[str],
    pdf_bytes_list: list[bytes],
    p_lang_list: list[str],
    backend="pipeline",
    parse_method="auto",
    formula_enable=True,
    table_enable=True,
    server_url=None,
    f_draw_layout_bbox=True,
    f_draw_span_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_output=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=True,
    f_make_md_mode=MakeMode.MM_MD,
    start_page_id=0,
    end_page_id=None,
    **kwargs,
):
    need_remove_index = _process_office_doc(
        output_dir,
        pdf_file_names=pdf_file_names,
        pdf_bytes_list=pdf_bytes_list,
        f_dump_md=f_dump_md,
        f_dump_middle_json=f_dump_middle_json,
        f_dump_model_output=f_dump_model_output,
        f_dump_orig_file=f_dump_orig_pdf,
        f_dump_content_list=f_dump_content_list,
        f_make_md_mode=f_make_md_mode,
    )
    for index in sorted(need_remove_index, reverse=True):
        del pdf_bytes_list[index]
        del pdf_file_names[index]
        del p_lang_list[index]
    if not pdf_bytes_list:
        logger.warning("No valid PDF or image files to process.")
        return

    # Pre-process the PDF bytes (apply page range, rewrite through pdfium)
    pdf_bytes_list = _prepare_pdf_bytes(pdf_bytes_list, start_page_id, end_page_id)

    if backend == "pipeline":
        # The pipeline backend does not support async yet; fall back to the
        # synchronous implementation.
        _process_pipeline(
            output_dir, pdf_file_names, pdf_bytes_list, p_lang_list,
            parse_method, formula_enable, table_enable,
            f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
            f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode
        )
    else:
        if backend.startswith("vlm-"):
            backend = backend[4:]
            if backend == "vllm-engine":
                raise Exception("vlm-vllm-engine backend is not supported in async mode, please use vlm-vllm-async-engine backend")
            if backend == "auto-engine":
                backend = get_vlm_engine(inference_engine='auto', is_async=True)
            os.environ['MINERU_VLM_FORMULA_ENABLE'] = str(formula_enable)
            os.environ['MINERU_VLM_TABLE_ENABLE'] = str(table_enable)
            await _async_process_vlm(
                output_dir, pdf_file_names, pdf_bytes_list, backend,
                f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
                f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode,
                server_url, **kwargs,
            )
        elif backend.startswith("hybrid-"):
            backend = backend[7:]
            if backend == "vllm-engine":
                raise Exception("hybrid-vllm-engine backend is not supported in async mode, please use hybrid-vllm-async-engine backend")
            if backend == "auto-engine":
                backend = get_vlm_engine(inference_engine='auto', is_async=True)
            os.environ['MINERU_VLM_TABLE_ENABLE'] = str(table_enable)
            os.environ['MINERU_VLM_FORMULA_ENABLE'] = "true"
            await _async_process_hybrid(
                output_dir, pdf_file_names, pdf_bytes_list, p_lang_list, parse_method, formula_enable, backend,
                f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
                f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode,
                server_url, **kwargs,
            )
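# Async counterpart (illustrative; the backend name is taken from the error
# messages above and assumes a vLLM-capable install):
#   import asyncio
#   asyncio.run(aio_do_parse("./output", [pdf.stem], [read_fn(pdf)], ["ch"],
#                            backend="vlm-vllm-async-engine"))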


if __name__ == "__main__":
    # pdf_path = "../../demo/pdfs/demo3.pdf"
    pdf_path = "C:/Users/zhaoxiaomeng/Downloads/4546d0e2-ba60-40a5-a17e-b68555cec741.pdf"
    try:
        do_parse("./output", [Path(pdf_path).stem], [read_fn(Path(pdf_path))], ["ch"],
                 end_page_id=10,
                 backend='vlm-huggingface'
                 # backend = 'pipeline'
                 )
    except Exception as e:
        logger.exception(e)