diff --git a/magic_pdf/pipeline.py b/magic_pdf/pipeline.py
index c33ef547..d842c929 100644
--- a/magic_pdf/pipeline.py
+++ b/magic_pdf/pipeline.py
@@ -23,17 +23,11 @@ from loguru import logger
 
 from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
 from magic_pdf.pdf_parse_for_train import parse_pdf_for_train
+from magic_pdf.spark.base import exception_handler, get_data_source
 from magic_pdf.train_utils.convert_to_train_format import convert_to_train_format
 from app.common.s3 import get_s3_config, get_s3_client
 
 
-def exception_handler(jso: dict, e):
-    logger.exception(e)
-    jso["need_drop"] = True
-    jso["drop_reason"] = DropReason.Exception
-    jso["exception"] = f"ERROR: {e}"
-    return jso
-
 
 def get_data_type(jso: dict):
     data_type = jso.get("data_type")
@@ -49,13 +43,6 @@ def get_bookid(jso: dict):
     return book_id
 
 
-def get_data_source(jso: dict):
-    data_source = jso.get("data_source")
-    if data_source is None:
-        data_source = jso.get("file_source")
-    return data_source
-
-
 def meta_scan(jso: dict, doc_layout_check=True) -> dict:
     s3_pdf_path = jso.get("file_location")
     s3_config = get_s3_config(s3_pdf_path)
diff --git a/magic_pdf/pipeline_txt.py b/magic_pdf/pipeline_txt.py
new file mode 100644
index 00000000..8d147c81
--- /dev/null
+++ b/magic_pdf/pipeline_txt.py
@@ -0,0 +1,47 @@
+"""
+Convert text-based PDFs into the unified cleaned format.
+"""
+
+
+
+from loguru import logger
+from magic_pdf.dict2md.mkcontent import mk_universal_format
+from magic_pdf.libs.commons import join_path
+from magic_pdf.libs.json_compressor import JsonCompressor
+from magic_pdf.spark.base import exception_handler, get_data_source
+
+
+def txt_pdf_to_standard_format(jso: dict, debug_mode=False) -> dict:
+    """Convert a record's ``pdf_intermediate_dict`` into ``content_list``.
+
+    Args:
+        jso: Record dict. Reads ``need_drop``, ``file_id`` and
+            ``pdf_intermediate_dict``.
+        debug_mode: When true, the ``need_drop`` check is skipped.
+
+    Returns:
+        The same ``jso`` dict. A skipped record gets ``dropped = True``; a
+        converted record gets ``content_list`` and has ``doc_layout_result``,
+        ``pdf_intermediate_dict`` and ``pdf_meta`` reset to ``""``; on any
+        exception the result of ``exception_handler`` is returned.
+    """
+    # Outside debug mode, skip records already flagged with need_drop.
+    if not debug_mode and jso.get("need_drop", False):
+        book_name = join_path(get_data_source(jso), jso["file_id"])
+        logger.info(f"book_name is:{book_name} need drop")
+        jso["dropped"] = True
+        return jso
+    try:
+        pdf_intermediate_dict = jso["pdf_intermediate_dict"]
+        # Decompress pdf_intermediate_dict.
+        pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
+        standard_format = mk_universal_format(pdf_intermediate_dict)
+        jso["content_list"] = standard_format
+        logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']},content_list length is {len(standard_format)}")
+        # Clear the fields that are no longer needed.
+        jso["doc_layout_result"] = ""
+        jso["pdf_intermediate_dict"] = ""
+        jso["pdf_meta"] = ""
+    except Exception as e:
+        jso = exception_handler(jso, e)
+    return jso
diff --git a/magic_pdf/spark/base.py b/magic_pdf/spark/base.py
new file mode 100644
index 00000000..c08d1a9f
--- /dev/null
+++ b/magic_pdf/spark/base.py
@@ -0,0 +1,31 @@
+
+
+from loguru import logger
+
+from magic_pdf.libs.drop_reason import DropReason
+
+
+def get_data_source(jso: dict):
+    """Return ``jso["data_source"]``, falling back to ``jso["file_source"]``.
+
+    The fallback is used when ``data_source`` is missing or ``None``; the
+    result is ``None`` when ``file_source`` is absent as well.
+    """
+    data_source = jso.get("data_source")
+    if data_source is None:
+        data_source = jso.get("file_source")
+    return data_source
+
+
+def exception_handler(jso: dict, e):
+    """Log ``e`` and mark ``jso`` as dropped because of an exception.
+
+    Sets ``need_drop`` to True, ``drop_reason`` to ``DropReason.Exception``
+    and ``exception`` to ``f"ERROR: {e}"``, then returns the same dict.
+    """
+    logger.exception(e)
+    jso["need_drop"] = True
+    jso["drop_reason"] = DropReason.Exception
+    jso["exception"] = f"ERROR: {e}"
+    return jso
+