diff --git a/.github/workflows/cli.yml b/.github/workflows/cli.yml index 762e106c..996b2ba0 100644 --- a/.github/workflows/cli.yml +++ b/.github/workflows/cli.yml @@ -38,11 +38,11 @@ jobs: cd $GITHUB_WORKSPACE && coverage run cd $GITHUB_WORKSPACE && python tests/get_coverage.py - notify_to_feishu: - if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure')}} - needs: cli-test - runs-on: ubuntu-latest - steps: - - name: notify - run: | - curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"'${{ github.repository }}' GitHubAction Failed","content":[[{"tag":"text","text":""},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"}]]}}}}' ${{ secrets.FEISHU_WEBHOOK_URL }} +# notify_to_feishu: +# if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure')}} +# needs: cli-test +# runs-on: ubuntu-latest +# steps: +# - name: notify +# run: | +# curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"'${{ github.repository }}' GitHubAction Failed","content":[[{"tag":"text","text":""},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"}]]}}}}' ${{ secrets.FEISHU_WEBHOOK_URL }} diff --git a/tests/unittest/test_e2e.py b/tests/unittest/test_e2e.py index 19f3b436..85c408a1 100644 --- a/tests/unittest/test_e2e.py +++ b/tests/unittest/test_e2e.py @@ -96,74 +96,74 @@ def test_pipeline_with_two_config(): assert_content(res_json_path, parse_method="ocr") -def test_vlm_transformers_with_default_config(): - __dir__ = os.path.dirname(os.path.abspath(__file__)) - pdf_files_dir = os.path.join(__dir__, "pdfs") - output_dir = os.path.join(__dir__, "output") - pdf_suffixes = [".pdf"] - image_suffixes = [".png", ".jpeg", ".jpg"] - - doc_path_list = [] - for doc_path in Path(pdf_files_dir).glob("*"): - if doc_path.suffix in pdf_suffixes + image_suffixes: - doc_path_list.append(doc_path) - - # os.environ["MINERU_MODEL_SOURCE"] = "modelscope" - - pdf_file_names = [] - pdf_bytes_list = [] - p_lang_list = [] - for path in doc_path_list: - file_name = str(Path(path).stem) - pdf_bytes = read_fn(path) - pdf_file_names.append(file_name) - pdf_bytes_list.append(pdf_bytes) - p_lang_list.append("en") - - for idx, pdf_bytes in enumerate(pdf_bytes_list): - pdf_file_name = pdf_file_names[idx] - pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes) - local_image_dir, local_md_dir = prepare_env( - output_dir, pdf_file_name, parse_method="vlm" - ) - image_writer, md_writer = FileBasedDataWriter( - local_image_dir - ), FileBasedDataWriter(local_md_dir) - middle_json, infer_result = vlm_doc_analyze( - pdf_bytes, image_writer=image_writer, backend="transformers" - ) - - pdf_info = middle_json["pdf_info"] - - image_dir = str(os.path.basename(local_image_dir)) - - md_content_str = vlm_union_make(pdf_info, MakeMode.MM_MD, image_dir) - md_writer.write_string( - f"{pdf_file_name}.md", - md_content_str, - ) - - content_list = vlm_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir) - md_writer.write_string( - f"{pdf_file_name}_content_list.json", - json.dumps(content_list, ensure_ascii=False, indent=4), - ) - - md_writer.write_string( - f"{pdf_file_name}_middle.json", - json.dumps(middle_json, ensure_ascii=False, indent=4), - ) - - md_writer.write_string( - f"{pdf_file_name}_model.json", - json.dumps(infer_result, ensure_ascii=False, indent=4), - ) - - logger.info(f"local output dir is {local_md_dir}") - res_json_path = ( - Path(__file__).parent / "output" / "test" / "vlm" / "test_content_list.json" - ).as_posix() - assert_content(res_json_path, parse_method="vlm") +# def test_vlm_transformers_with_default_config(): +# __dir__ = os.path.dirname(os.path.abspath(__file__)) +# pdf_files_dir = os.path.join(__dir__, "pdfs") +# output_dir = os.path.join(__dir__, "output") +# pdf_suffixes = [".pdf"] +# image_suffixes = [".png", ".jpeg", ".jpg"] +# +# doc_path_list = [] +# for doc_path in Path(pdf_files_dir).glob("*"): +# if doc_path.suffix in pdf_suffixes + image_suffixes: +# doc_path_list.append(doc_path) +# +# # os.environ["MINERU_MODEL_SOURCE"] = "modelscope" +# +# pdf_file_names = [] +# pdf_bytes_list = [] +# p_lang_list = [] +# for path in doc_path_list: +# file_name = str(Path(path).stem) +# pdf_bytes = read_fn(path) +# pdf_file_names.append(file_name) +# pdf_bytes_list.append(pdf_bytes) +# p_lang_list.append("en") +# +# for idx, pdf_bytes in enumerate(pdf_bytes_list): +# pdf_file_name = pdf_file_names[idx] +# pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes) +# local_image_dir, local_md_dir = prepare_env( +# output_dir, pdf_file_name, parse_method="vlm" +# ) +# image_writer, md_writer = FileBasedDataWriter( +# local_image_dir +# ), FileBasedDataWriter(local_md_dir) +# middle_json, infer_result = vlm_doc_analyze( +# pdf_bytes, image_writer=image_writer, backend="transformers" +# ) +# +# pdf_info = middle_json["pdf_info"] +# +# image_dir = str(os.path.basename(local_image_dir)) +# +# md_content_str = vlm_union_make(pdf_info, MakeMode.MM_MD, image_dir) +# md_writer.write_string( +# f"{pdf_file_name}.md", +# md_content_str, +# ) +# +# content_list = vlm_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir) +# md_writer.write_string( +# f"{pdf_file_name}_content_list.json", +# json.dumps(content_list, ensure_ascii=False, indent=4), +# ) +# +# md_writer.write_string( +# f"{pdf_file_name}_middle.json", +# json.dumps(middle_json, ensure_ascii=False, indent=4), +# ) +# +# md_writer.write_string( +# f"{pdf_file_name}_model.json", +# json.dumps(infer_result, ensure_ascii=False, indent=4), +# ) +# +# logger.info(f"local output dir is {local_md_dir}") +# res_json_path = ( +# Path(__file__).parent / "output" / "test" / "vlm" / "test_content_list.json" +# ).as_posix() +# assert_content(res_json_path, parse_method="vlm") def write_infer_result(