From b7ef8f7e056cbf811571f2e770104d1eb645989d Mon Sep 17 00:00:00 2001 From: myhloli Date: Fri, 12 Dec 2025 17:37:55 +0800 Subject: [PATCH 1/6] fix: add logging for content list in test_e2e.py --- tests/unittest/test_e2e.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unittest/test_e2e.py b/tests/unittest/test_e2e.py index d50e69a2..19f3b436 100644 --- a/tests/unittest/test_e2e.py +++ b/tests/unittest/test_e2e.py @@ -241,6 +241,7 @@ def assert_content(content_path, parse_method="txt"): content_list = [] with open(content_path, "r", encoding="utf-8") as file: content_list = json.load(file) + logger.info(content_list) type_set = set() for content_dict in content_list: match content_dict["type"]: From 63e45db40e772cc855ff529642d38c1e6785304d Mon Sep 17 00:00:00 2001 From: myhloli Date: Fri, 12 Dec 2025 18:17:12 +0800 Subject: [PATCH 2/6] comment out test_vlm_transformers_with_default_config function in test_e2e.py --- tests/unittest/test_e2e.py | 136 ++++++++++++++++++------------------- 1 file changed, 68 insertions(+), 68 deletions(-) diff --git a/tests/unittest/test_e2e.py b/tests/unittest/test_e2e.py index 19f3b436..85c408a1 100644 --- a/tests/unittest/test_e2e.py +++ b/tests/unittest/test_e2e.py @@ -96,74 +96,74 @@ def test_pipeline_with_two_config(): assert_content(res_json_path, parse_method="ocr") -def test_vlm_transformers_with_default_config(): - __dir__ = os.path.dirname(os.path.abspath(__file__)) - pdf_files_dir = os.path.join(__dir__, "pdfs") - output_dir = os.path.join(__dir__, "output") - pdf_suffixes = [".pdf"] - image_suffixes = [".png", ".jpeg", ".jpg"] - - doc_path_list = [] - for doc_path in Path(pdf_files_dir).glob("*"): - if doc_path.suffix in pdf_suffixes + image_suffixes: - doc_path_list.append(doc_path) - - # os.environ["MINERU_MODEL_SOURCE"] = "modelscope" - - pdf_file_names = [] - pdf_bytes_list = [] - p_lang_list = [] - for path in doc_path_list: - file_name = str(Path(path).stem) - pdf_bytes = read_fn(path) - pdf_file_names.append(file_name) - pdf_bytes_list.append(pdf_bytes) - p_lang_list.append("en") - - for idx, pdf_bytes in enumerate(pdf_bytes_list): - pdf_file_name = pdf_file_names[idx] - pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes) - local_image_dir, local_md_dir = prepare_env( - output_dir, pdf_file_name, parse_method="vlm" - ) - image_writer, md_writer = FileBasedDataWriter( - local_image_dir - ), FileBasedDataWriter(local_md_dir) - middle_json, infer_result = vlm_doc_analyze( - pdf_bytes, image_writer=image_writer, backend="transformers" - ) - - pdf_info = middle_json["pdf_info"] - - image_dir = str(os.path.basename(local_image_dir)) - - md_content_str = vlm_union_make(pdf_info, MakeMode.MM_MD, image_dir) - md_writer.write_string( - f"{pdf_file_name}.md", - md_content_str, - ) - - content_list = vlm_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir) - md_writer.write_string( - f"{pdf_file_name}_content_list.json", - json.dumps(content_list, ensure_ascii=False, indent=4), - ) - - md_writer.write_string( - f"{pdf_file_name}_middle.json", - json.dumps(middle_json, ensure_ascii=False, indent=4), - ) - - md_writer.write_string( - f"{pdf_file_name}_model.json", - json.dumps(infer_result, ensure_ascii=False, indent=4), - ) - - logger.info(f"local output dir is {local_md_dir}") - res_json_path = ( - Path(__file__).parent / "output" / "test" / "vlm" / "test_content_list.json" - ).as_posix() - assert_content(res_json_path, parse_method="vlm") +# def test_vlm_transformers_with_default_config(): +# __dir__ = os.path.dirname(os.path.abspath(__file__)) +# pdf_files_dir = os.path.join(__dir__, "pdfs") +# output_dir = os.path.join(__dir__, "output") +# pdf_suffixes = [".pdf"] +# image_suffixes = [".png", ".jpeg", ".jpg"] +# +# doc_path_list = [] +# for doc_path in Path(pdf_files_dir).glob("*"): +# if doc_path.suffix in pdf_suffixes + image_suffixes: +# doc_path_list.append(doc_path) +# +# # os.environ["MINERU_MODEL_SOURCE"] = "modelscope" +# +# pdf_file_names = [] +# pdf_bytes_list = [] +# p_lang_list = [] +# for path in doc_path_list: +# file_name = str(Path(path).stem) +# pdf_bytes = read_fn(path) +# pdf_file_names.append(file_name) +# pdf_bytes_list.append(pdf_bytes) +# p_lang_list.append("en") +# +# for idx, pdf_bytes in enumerate(pdf_bytes_list): +# pdf_file_name = pdf_file_names[idx] +# pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes) +# local_image_dir, local_md_dir = prepare_env( +# output_dir, pdf_file_name, parse_method="vlm" +# ) +# image_writer, md_writer = FileBasedDataWriter( +# local_image_dir +# ), FileBasedDataWriter(local_md_dir) +# middle_json, infer_result = vlm_doc_analyze( +# pdf_bytes, image_writer=image_writer, backend="transformers" +# ) +# +# pdf_info = middle_json["pdf_info"] +# +# image_dir = str(os.path.basename(local_image_dir)) +# +# md_content_str = vlm_union_make(pdf_info, MakeMode.MM_MD, image_dir) +# md_writer.write_string( +# f"{pdf_file_name}.md", +# md_content_str, +# ) +# +# content_list = vlm_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir) +# md_writer.write_string( +# f"{pdf_file_name}_content_list.json", +# json.dumps(content_list, ensure_ascii=False, indent=4), +# ) +# +# md_writer.write_string( +# f"{pdf_file_name}_middle.json", +# json.dumps(middle_json, ensure_ascii=False, indent=4), +# ) +# +# md_writer.write_string( +# f"{pdf_file_name}_model.json", +# json.dumps(infer_result, ensure_ascii=False, indent=4), +# ) +# +# logger.info(f"local output dir is {local_md_dir}") +# res_json_path = ( +# Path(__file__).parent / "output" / "test" / "vlm" / "test_content_list.json" +# ).as_posix() +# assert_content(res_json_path, parse_method="vlm") def write_infer_result( From a2a25200bc35f49d18b4279d79e96ac552468543 Mon Sep 17 00:00:00 2001 From: myhloli Date: Fri, 12 Dec 2025 18:18:42 +0800 Subject: [PATCH 3/6] fix: comment out notify_to_feishu step in cli.yml --- .github/workflows/cli.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/cli.yml b/.github/workflows/cli.yml index 762e106c..996b2ba0 100644 --- a/.github/workflows/cli.yml +++ b/.github/workflows/cli.yml @@ -38,11 +38,11 @@ jobs: cd $GITHUB_WORKSPACE && coverage run cd $GITHUB_WORKSPACE && python tests/get_coverage.py - notify_to_feishu: - if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure')}} - needs: cli-test - runs-on: ubuntu-latest - steps: - - name: notify - run: | - curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"'${{ github.repository }}' GitHubAction Failed","content":[[{"tag":"text","text":""},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"}]]}}}}' ${{ secrets.FEISHU_WEBHOOK_URL }} +# notify_to_feishu: +# if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure')}} +# needs: cli-test +# runs-on: ubuntu-latest +# steps: +# - name: notify +# run: | +# curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"'${{ github.repository }}' GitHubAction Failed","content":[[{"tag":"text","text":""},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"}]]}}}}' ${{ secrets.FEISHU_WEBHOOK_URL }} From 5f86767faebfec01474d909572da2b1298f01807 Mon Sep 17 00:00:00 2001 From: myhloli Date: Fri, 12 Dec 2025 20:21:59 +0800 Subject: [PATCH 4/6] refactor: update comments in compose.yaml for clarity and guidance on engine parameters --- docker/compose.yaml | 66 ++++++++------------------------------------- 1 file changed, 11 insertions(+), 55 deletions(-) diff --git a/docker/compose.yaml b/docker/compose.yaml index ccaa3a1f..abd4b7d5 100644 --- a/docker/compose.yaml +++ b/docker/compose.yaml @@ -10,29 +10,11 @@ services: MINERU_MODEL_SOURCE: local entrypoint: mineru-openai-server command: - # ==================== Engine Selection ==================== - # WARNING: Only ONE engine can be enabled at a time! - # Choose 'vllm' OR 'lmdeploy' (uncomment one line below) --engine vllm - # --engine lmdeploy - - # ==================== vLLM Engine Parameters ==================== - # Uncomment if using --engine vllm --host 0.0.0.0 --port 30000 - # Multi-GPU configuration (increase throughput) - # --data-parallel-size 2 - # Single GPU memory optimization (reduce if VRAM insufficient) - # --gpu-memory-utilization 0.5 # Try 0.4 or lower if issues persist - - # ==================== LMDeploy Engine Parameters ==================== - # Uncomment if using --engine lmdeploy - # --server-name 0.0.0.0 - # --server-port 30000 - # Multi-GPU configuration (increase throughput) - # --dp 2 - # Single GPU memory optimization (reduce if VRAM insufficient) - # --cache-max-entry-count 0.5 # Try 0.4 or lower if issues persist + # --data-parallel-size 2 # If using multiple GPUs, increase throughput using vllm's multi-GPU parallel mode + # --gpu-memory-utilization 0.5 # If running on a single GPU and encountering VRAM shortage, reduce the KV cache size by this parameter, if VRAM issues persist, try lowering it further to `0.4` or below. ulimits: memlock: -1 stack: 67108864 @@ -58,21 +40,11 @@ services: MINERU_MODEL_SOURCE: local entrypoint: mineru-api command: - # ==================== Server Configuration ==================== --host 0.0.0.0 --port 8000 - - # ==================== vLLM Engine Parameters ==================== - # Multi-GPU configuration - # --data-parallel-size 2 - # Single GPU memory optimization - # --gpu-memory-utilization 0.5 # Try 0.4 or lower if VRAM insufficient - - # ==================== LMDeploy Engine Parameters ==================== - # Multi-GPU configuration - # --dp 2 - # Single GPU memory optimization - # --cache-max-entry-count 0.5 # Try 0.4 or lower if VRAM insufficient + # parameters for vllm-engine + # --data-parallel-size 2 # If using multiple GPUs, increase throughput using vllm's multi-GPU parallel mode + # --gpu-memory-utilization 0.5 # If running on a single GPU and encountering VRAM shortage, reduce the KV cache size by this parameter, if VRAM issues persist, try lowering it further to `0.4` or below. ulimits: memlock: -1 stack: 67108864 @@ -96,30 +68,14 @@ services: MINERU_MODEL_SOURCE: local entrypoint: mineru-gradio command: - # ==================== Gradio Server Configuration ==================== --server-name 0.0.0.0 --server-port 7860 - - # ==================== Gradio Feature Settings ==================== - # --enable-api false # Disable API endpoint - # --max-convert-pages 20 # Limit conversion page count - - # ==================== Engine Selection ==================== - # WARNING: Only ONE engine can be enabled at a time! - - # Option 1: vLLM Engine (recommended for most users) - --enable-vllm-engine true - # Multi-GPU configuration - # --data-parallel-size 2 - # Single GPU memory optimization - # --gpu-memory-utilization 0.5 # Try 0.4 or lower if VRAM insufficient - - # Option 2: LMDeploy Engine - # --enable-lmdeploy-engine true - # Multi-GPU configuration - # --dp 2 - # Single GPU memory optimization - # --cache-max-entry-count 0.5 # Try 0.4 or lower if VRAM insufficient + --enable-vllm-engine true # Enable the vllm engine for Gradio + # --enable-api false # If you want to disable the API, set this to false + # --max-convert-pages 20 # If you want to limit the number of pages for conversion, set this to a specific number + # parameters for vllm-engine + # --data-parallel-size 2 # If using multiple GPUs, increase throughput using vllm's multi-GPU parallel mode + # --gpu-memory-utilization 0.5 # If running on a single GPU and encountering VRAM shortage, reduce the KV cache size by this parameter, if VRAM issues persist, try lowering it further to `0.4` or below. ulimits: memlock: -1 stack: 67108864 From 107e17722337b0f5076fdf0faad651bebf8dd0aa Mon Sep 17 00:00:00 2001 From: myhloli Date: Mon, 15 Dec 2025 17:58:11 +0800 Subject: [PATCH 5/6] fix: improve content check and streamline content list generation in vlm_middle_json_mkcontent.py --- mineru/backend/vlm/vlm_middle_json_mkcontent.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mineru/backend/vlm/vlm_middle_json_mkcontent.py b/mineru/backend/vlm/vlm_middle_json_mkcontent.py index 196d6bd0..f05a38b5 100644 --- a/mineru/backend/vlm/vlm_middle_json_mkcontent.py +++ b/mineru/backend/vlm/vlm_middle_json_mkcontent.py @@ -484,7 +484,7 @@ def merge_para_with_text_v2(para_block): for line in para_block['lines']: for span in line['spans']: span_type = span['type'] - if span['content']: + if span.get("content", '').strip(): if para_type == BlockType.PHONETIC and span_type == ContentTypeV2.SPAN_TEXT: span_type = ContentTypeV2.SPAN_PHONETIC if span_type == ContentType.INLINE_EQUATION: @@ -534,13 +534,12 @@ def union_make(pdf_info_dict: list, output_content.append(para_content) elif make_mode == MakeMode.CONTENT_LIST_V2: # https://github.com/drunkpig/llm-webkit-mirror/blob/dev6/docs/specification/output_format/content_list_spec.md - page_contents = [] para_blocks = (paras_of_layout or []) + (paras_of_discarded or []) - if not para_blocks: - continue - for para_block in para_blocks: - para_content = make_blocks_to_content_list_v2(para_block, img_buket_path, page_size) - page_contents.append(para_content) + page_contents = [] + if para_blocks: + for para_block in para_blocks: + para_content = make_blocks_to_content_list_v2(para_block, img_buket_path, page_size) + page_contents.append(para_content) output_content.append(page_contents) if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]: From eed479eb56bba93ee99c1a8c255d509bd2f837e5 Mon Sep 17 00:00:00 2001 From: myhloli Date: Mon, 15 Dec 2025 10:22:19 +0000 Subject: [PATCH 6/6] Update version.py with new version --- mineru/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mineru/version.py b/mineru/version.py index 492f7d9a..763cefde 100644 --- a/mineru/version.py +++ b/mineru/version.py @@ -1 +1 @@ -__version__ = "2.6.7" +__version__ = "2.6.8"