diff --git a/tests/unittest/pdfs/test.pdf b/tests/unittest/pdfs/test.pdf index 62d32720..009632f6 100644 Binary files a/tests/unittest/pdfs/test.pdf and b/tests/unittest/pdfs/test.pdf differ diff --git a/tests/unittest/test_e2e.py b/tests/unittest/test_e2e.py index 088d598a..32b69511 100644 --- a/tests/unittest/test_e2e.py +++ b/tests/unittest/test_e2e.py @@ -72,7 +72,7 @@ def test_pipeline_with_two_config(): res_json_path = ( Path(__file__).parent / "output" / "test" / "txt" / "test_content_list.json" ).as_posix() - assert_content(res_json_path) + assert_content(res_json_path, parse_method="txt") infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = ( pipeline_doc_analyze( pdf_bytes_list, @@ -93,7 +93,7 @@ def test_pipeline_with_two_config(): res_json_path = ( Path(__file__).parent / "output" / "test" / "ocr" / "test_content_list.json" ).as_posix() - assert_content(res_json_path) + assert_content(res_json_path, parse_method="ocr") def test_vlm_transformers_with_default_config(): @@ -164,7 +164,7 @@ def test_vlm_transformers_with_default_config(): res_json_path = ( Path(__file__).parent / "output" / "test" / "vlm" / "test_content_list.json" ).as_posix() - assert_content(res_json_path) + assert_content(res_json_path, parse_method="vlm") def write_infer_result( @@ -238,7 +238,7 @@ def validate_html(html_content): return False -def assert_content(content_path): +def assert_content(content_path, parse_method="txt"): content_list = [] with open(content_path, "r", encoding="utf-8") as file: content_list = json.load(file) @@ -267,19 +267,15 @@ def assert_content(content_path): ) assert validate_html(content_dict["table_body"]) target_str_list = [ - "Linear Regression", + "Model", + "Testing", + "Error", + "Linear", + "Regression", "0.98740", "1321.2", - "2-order Polynomial", - "0.99906", - "26.4", - "3-order Polynomial", - "0.99913", - "101.2", - "4-order Polynomial", - "0.99914", - "94.1", - "Gray Prediction", + "Gray", + "Prediction", "0.00617", "687", ] @@ -287,8 +283,12 @@ def assert_content(content_path): for target_str in target_str_list: if target_str in content_dict["table_body"]: correct_count += 1 - - assert correct_count > 0.9 * len(target_str_list) + if parse_method == "txt" or parse_method == "ocr": + assert correct_count > 0.9 * len(target_str_list) + elif parse_method == "vlm": + assert correct_count > 0.7 * len(target_str_list) + else: + assert False # 公式校验,检测是否含有公式元素 case "equation": type_set.add("equation")