mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
16
.github/workflows/cli.yml
vendored
16
.github/workflows/cli.yml
vendored
@@ -38,11 +38,11 @@ jobs:
|
||||
cd $GITHUB_WORKSPACE && coverage run
|
||||
cd $GITHUB_WORKSPACE && python tests/get_coverage.py
|
||||
|
||||
notify_to_feishu:
|
||||
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure')}}
|
||||
needs: cli-test
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: notify
|
||||
run: |
|
||||
curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"'${{ github.repository }}' GitHubAction Failed","content":[[{"tag":"text","text":""},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"}]]}}}}' ${{ secrets.FEISHU_WEBHOOK_URL }}
|
||||
# notify_to_feishu:
|
||||
# if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure')}}
|
||||
# needs: cli-test
|
||||
# runs-on: ubuntu-latest
|
||||
# steps:
|
||||
# - name: notify
|
||||
# run: |
|
||||
# curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"'${{ github.repository }}' GitHubAction Failed","content":[[{"tag":"text","text":""},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"}]]}}}}' ${{ secrets.FEISHU_WEBHOOK_URL }}
|
||||
|
||||
@@ -10,29 +10,11 @@ services:
|
||||
MINERU_MODEL_SOURCE: local
|
||||
entrypoint: mineru-openai-server
|
||||
command:
|
||||
# ==================== Engine Selection ====================
|
||||
# WARNING: Only ONE engine can be enabled at a time!
|
||||
# Choose 'vllm' OR 'lmdeploy' (uncomment one line below)
|
||||
--engine vllm
|
||||
# --engine lmdeploy
|
||||
|
||||
# ==================== vLLM Engine Parameters ====================
|
||||
# Uncomment if using --engine vllm
|
||||
--host 0.0.0.0
|
||||
--port 30000
|
||||
# Multi-GPU configuration (increase throughput)
|
||||
# --data-parallel-size 2
|
||||
# Single GPU memory optimization (reduce if VRAM insufficient)
|
||||
# --gpu-memory-utilization 0.5 # Try 0.4 or lower if issues persist
|
||||
|
||||
# ==================== LMDeploy Engine Parameters ====================
|
||||
# Uncomment if using --engine lmdeploy
|
||||
# --server-name 0.0.0.0
|
||||
# --server-port 30000
|
||||
# Multi-GPU configuration (increase throughput)
|
||||
# --dp 2
|
||||
# Single GPU memory optimization (reduce if VRAM insufficient)
|
||||
# --cache-max-entry-count 0.5 # Try 0.4 or lower if issues persist
|
||||
# --data-parallel-size 2 # If using multiple GPUs, increase throughput using vllm's multi-GPU parallel mode
|
||||
# --gpu-memory-utilization 0.5 # If running on a single GPU and encountering VRAM shortage, reduce the KV cache size by this parameter, if VRAM issues persist, try lowering it further to `0.4` or below.
|
||||
ulimits:
|
||||
memlock: -1
|
||||
stack: 67108864
|
||||
@@ -58,21 +40,11 @@ services:
|
||||
MINERU_MODEL_SOURCE: local
|
||||
entrypoint: mineru-api
|
||||
command:
|
||||
# ==================== Server Configuration ====================
|
||||
--host 0.0.0.0
|
||||
--port 8000
|
||||
|
||||
# ==================== vLLM Engine Parameters ====================
|
||||
# Multi-GPU configuration
|
||||
# --data-parallel-size 2
|
||||
# Single GPU memory optimization
|
||||
# --gpu-memory-utilization 0.5 # Try 0.4 or lower if VRAM insufficient
|
||||
|
||||
# ==================== LMDeploy Engine Parameters ====================
|
||||
# Multi-GPU configuration
|
||||
# --dp 2
|
||||
# Single GPU memory optimization
|
||||
# --cache-max-entry-count 0.5 # Try 0.4 or lower if VRAM insufficient
|
||||
# parameters for vllm-engine
|
||||
# --data-parallel-size 2 # If using multiple GPUs, increase throughput using vllm's multi-GPU parallel mode
|
||||
# --gpu-memory-utilization 0.5 # If running on a single GPU and encountering VRAM shortage, reduce the KV cache size by this parameter, if VRAM issues persist, try lowering it further to `0.4` or below.
|
||||
ulimits:
|
||||
memlock: -1
|
||||
stack: 67108864
|
||||
@@ -96,30 +68,14 @@ services:
|
||||
MINERU_MODEL_SOURCE: local
|
||||
entrypoint: mineru-gradio
|
||||
command:
|
||||
# ==================== Gradio Server Configuration ====================
|
||||
--server-name 0.0.0.0
|
||||
--server-port 7860
|
||||
|
||||
# ==================== Gradio Feature Settings ====================
|
||||
# --enable-api false # Disable API endpoint
|
||||
# --max-convert-pages 20 # Limit conversion page count
|
||||
|
||||
# ==================== Engine Selection ====================
|
||||
# WARNING: Only ONE engine can be enabled at a time!
|
||||
|
||||
# Option 1: vLLM Engine (recommended for most users)
|
||||
--enable-vllm-engine true
|
||||
# Multi-GPU configuration
|
||||
# --data-parallel-size 2
|
||||
# Single GPU memory optimization
|
||||
# --gpu-memory-utilization 0.5 # Try 0.4 or lower if VRAM insufficient
|
||||
|
||||
# Option 2: LMDeploy Engine
|
||||
# --enable-lmdeploy-engine true
|
||||
# Multi-GPU configuration
|
||||
# --dp 2
|
||||
# Single GPU memory optimization
|
||||
# --cache-max-entry-count 0.5 # Try 0.4 or lower if VRAM insufficient
|
||||
--enable-vllm-engine true # Enable the vllm engine for Gradio
|
||||
# --enable-api false # If you want to disable the API, set this to false
|
||||
# --max-convert-pages 20 # If you want to limit the number of pages for conversion, set this to a specific number
|
||||
# parameters for vllm-engine
|
||||
# --data-parallel-size 2 # If using multiple GPUs, increase throughput using vllm's multi-GPU parallel mode
|
||||
# --gpu-memory-utilization 0.5 # If running on a single GPU and encountering VRAM shortage, reduce the KV cache size by this parameter, if VRAM issues persist, try lowering it further to `0.4` or below.
|
||||
ulimits:
|
||||
memlock: -1
|
||||
stack: 67108864
|
||||
|
||||
@@ -484,7 +484,7 @@ def merge_para_with_text_v2(para_block):
|
||||
for line in para_block['lines']:
|
||||
for span in line['spans']:
|
||||
span_type = span['type']
|
||||
if span['content']:
|
||||
if span.get("content", '').strip():
|
||||
if para_type == BlockType.PHONETIC and span_type == ContentTypeV2.SPAN_TEXT:
|
||||
span_type = ContentTypeV2.SPAN_PHONETIC
|
||||
if span_type == ContentType.INLINE_EQUATION:
|
||||
@@ -534,13 +534,12 @@ def union_make(pdf_info_dict: list,
|
||||
output_content.append(para_content)
|
||||
elif make_mode == MakeMode.CONTENT_LIST_V2:
|
||||
# https://github.com/drunkpig/llm-webkit-mirror/blob/dev6/docs/specification/output_format/content_list_spec.md
|
||||
page_contents = []
|
||||
para_blocks = (paras_of_layout or []) + (paras_of_discarded or [])
|
||||
if not para_blocks:
|
||||
continue
|
||||
for para_block in para_blocks:
|
||||
para_content = make_blocks_to_content_list_v2(para_block, img_buket_path, page_size)
|
||||
page_contents.append(para_content)
|
||||
page_contents = []
|
||||
if para_blocks:
|
||||
for para_block in para_blocks:
|
||||
para_content = make_blocks_to_content_list_v2(para_block, img_buket_path, page_size)
|
||||
page_contents.append(para_content)
|
||||
output_content.append(page_contents)
|
||||
|
||||
if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
|
||||
|
||||
@@ -1 +1 @@
|
||||
__version__ = "2.6.7"
|
||||
__version__ = "2.6.8"
|
||||
|
||||
@@ -96,74 +96,74 @@ def test_pipeline_with_two_config():
|
||||
assert_content(res_json_path, parse_method="ocr")
|
||||
|
||||
|
||||
def test_vlm_transformers_with_default_config():
|
||||
__dir__ = os.path.dirname(os.path.abspath(__file__))
|
||||
pdf_files_dir = os.path.join(__dir__, "pdfs")
|
||||
output_dir = os.path.join(__dir__, "output")
|
||||
pdf_suffixes = [".pdf"]
|
||||
image_suffixes = [".png", ".jpeg", ".jpg"]
|
||||
|
||||
doc_path_list = []
|
||||
for doc_path in Path(pdf_files_dir).glob("*"):
|
||||
if doc_path.suffix in pdf_suffixes + image_suffixes:
|
||||
doc_path_list.append(doc_path)
|
||||
|
||||
# os.environ["MINERU_MODEL_SOURCE"] = "modelscope"
|
||||
|
||||
pdf_file_names = []
|
||||
pdf_bytes_list = []
|
||||
p_lang_list = []
|
||||
for path in doc_path_list:
|
||||
file_name = str(Path(path).stem)
|
||||
pdf_bytes = read_fn(path)
|
||||
pdf_file_names.append(file_name)
|
||||
pdf_bytes_list.append(pdf_bytes)
|
||||
p_lang_list.append("en")
|
||||
|
||||
for idx, pdf_bytes in enumerate(pdf_bytes_list):
|
||||
pdf_file_name = pdf_file_names[idx]
|
||||
pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes)
|
||||
local_image_dir, local_md_dir = prepare_env(
|
||||
output_dir, pdf_file_name, parse_method="vlm"
|
||||
)
|
||||
image_writer, md_writer = FileBasedDataWriter(
|
||||
local_image_dir
|
||||
), FileBasedDataWriter(local_md_dir)
|
||||
middle_json, infer_result = vlm_doc_analyze(
|
||||
pdf_bytes, image_writer=image_writer, backend="transformers"
|
||||
)
|
||||
|
||||
pdf_info = middle_json["pdf_info"]
|
||||
|
||||
image_dir = str(os.path.basename(local_image_dir))
|
||||
|
||||
md_content_str = vlm_union_make(pdf_info, MakeMode.MM_MD, image_dir)
|
||||
md_writer.write_string(
|
||||
f"{pdf_file_name}.md",
|
||||
md_content_str,
|
||||
)
|
||||
|
||||
content_list = vlm_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
|
||||
md_writer.write_string(
|
||||
f"{pdf_file_name}_content_list.json",
|
||||
json.dumps(content_list, ensure_ascii=False, indent=4),
|
||||
)
|
||||
|
||||
md_writer.write_string(
|
||||
f"{pdf_file_name}_middle.json",
|
||||
json.dumps(middle_json, ensure_ascii=False, indent=4),
|
||||
)
|
||||
|
||||
md_writer.write_string(
|
||||
f"{pdf_file_name}_model.json",
|
||||
json.dumps(infer_result, ensure_ascii=False, indent=4),
|
||||
)
|
||||
|
||||
logger.info(f"local output dir is {local_md_dir}")
|
||||
res_json_path = (
|
||||
Path(__file__).parent / "output" / "test" / "vlm" / "test_content_list.json"
|
||||
).as_posix()
|
||||
assert_content(res_json_path, parse_method="vlm")
|
||||
# def test_vlm_transformers_with_default_config():
|
||||
# __dir__ = os.path.dirname(os.path.abspath(__file__))
|
||||
# pdf_files_dir = os.path.join(__dir__, "pdfs")
|
||||
# output_dir = os.path.join(__dir__, "output")
|
||||
# pdf_suffixes = [".pdf"]
|
||||
# image_suffixes = [".png", ".jpeg", ".jpg"]
|
||||
#
|
||||
# doc_path_list = []
|
||||
# for doc_path in Path(pdf_files_dir).glob("*"):
|
||||
# if doc_path.suffix in pdf_suffixes + image_suffixes:
|
||||
# doc_path_list.append(doc_path)
|
||||
#
|
||||
# # os.environ["MINERU_MODEL_SOURCE"] = "modelscope"
|
||||
#
|
||||
# pdf_file_names = []
|
||||
# pdf_bytes_list = []
|
||||
# p_lang_list = []
|
||||
# for path in doc_path_list:
|
||||
# file_name = str(Path(path).stem)
|
||||
# pdf_bytes = read_fn(path)
|
||||
# pdf_file_names.append(file_name)
|
||||
# pdf_bytes_list.append(pdf_bytes)
|
||||
# p_lang_list.append("en")
|
||||
#
|
||||
# for idx, pdf_bytes in enumerate(pdf_bytes_list):
|
||||
# pdf_file_name = pdf_file_names[idx]
|
||||
# pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes)
|
||||
# local_image_dir, local_md_dir = prepare_env(
|
||||
# output_dir, pdf_file_name, parse_method="vlm"
|
||||
# )
|
||||
# image_writer, md_writer = FileBasedDataWriter(
|
||||
# local_image_dir
|
||||
# ), FileBasedDataWriter(local_md_dir)
|
||||
# middle_json, infer_result = vlm_doc_analyze(
|
||||
# pdf_bytes, image_writer=image_writer, backend="transformers"
|
||||
# )
|
||||
#
|
||||
# pdf_info = middle_json["pdf_info"]
|
||||
#
|
||||
# image_dir = str(os.path.basename(local_image_dir))
|
||||
#
|
||||
# md_content_str = vlm_union_make(pdf_info, MakeMode.MM_MD, image_dir)
|
||||
# md_writer.write_string(
|
||||
# f"{pdf_file_name}.md",
|
||||
# md_content_str,
|
||||
# )
|
||||
#
|
||||
# content_list = vlm_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
|
||||
# md_writer.write_string(
|
||||
# f"{pdf_file_name}_content_list.json",
|
||||
# json.dumps(content_list, ensure_ascii=False, indent=4),
|
||||
# )
|
||||
#
|
||||
# md_writer.write_string(
|
||||
# f"{pdf_file_name}_middle.json",
|
||||
# json.dumps(middle_json, ensure_ascii=False, indent=4),
|
||||
# )
|
||||
#
|
||||
# md_writer.write_string(
|
||||
# f"{pdf_file_name}_model.json",
|
||||
# json.dumps(infer_result, ensure_ascii=False, indent=4),
|
||||
# )
|
||||
#
|
||||
# logger.info(f"local output dir is {local_md_dir}")
|
||||
# res_json_path = (
|
||||
# Path(__file__).parent / "output" / "test" / "vlm" / "test_content_list.json"
|
||||
# ).as_posix()
|
||||
# assert_content(res_json_path, parse_method="vlm")
|
||||
|
||||
|
||||
def write_infer_result(
|
||||
@@ -241,6 +241,7 @@ def assert_content(content_path, parse_method="txt"):
|
||||
content_list = []
|
||||
with open(content_path, "r", encoding="utf-8") as file:
|
||||
content_list = json.load(file)
|
||||
logger.info(content_list)
|
||||
type_set = set()
|
||||
for content_dict in content_list:
|
||||
match content_dict["type"]:
|
||||
|
||||
Reference in New Issue
Block a user