mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
实现文本拼PDF解析结果装标准格式
This commit is contained in:
@@ -23,17 +23,11 @@ from loguru import logger
|
||||
|
||||
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
|
||||
from magic_pdf.pdf_parse_for_train import parse_pdf_for_train
|
||||
from magic_pdf.spark.base import exception_handler, get_data_source
|
||||
from magic_pdf.train_utils.convert_to_train_format import convert_to_train_format
|
||||
from app.common.s3 import get_s3_config, get_s3_client
|
||||
|
||||
|
||||
def exception_handler(jso: dict, e):
|
||||
logger.exception(e)
|
||||
jso["need_drop"] = True
|
||||
jso["drop_reason"] = DropReason.Exception
|
||||
jso["exception"] = f"ERROR: {e}"
|
||||
return jso
|
||||
|
||||
|
||||
def get_data_type(jso: dict):
|
||||
data_type = jso.get("data_type")
|
||||
@@ -49,13 +43,6 @@ def get_bookid(jso: dict):
|
||||
return book_id
|
||||
|
||||
|
||||
def get_data_source(jso: dict):
|
||||
data_source = jso.get("data_source")
|
||||
if data_source is None:
|
||||
data_source = jso.get("file_source")
|
||||
return data_source
|
||||
|
||||
|
||||
def meta_scan(jso: dict, doc_layout_check=True) -> dict:
|
||||
s3_pdf_path = jso.get("file_location")
|
||||
s3_config = get_s3_config(s3_pdf_path)
|
||||
|
||||
37
magic_pdf/pipeline_txt.py
Normal file
37
magic_pdf/pipeline_txt.py
Normal file
@@ -0,0 +1,37 @@
|
||||
"""
|
||||
文本型pdf转化为统一清洗格式
|
||||
"""
|
||||
|
||||
|
||||
|
||||
from loguru import logger
|
||||
from magic_pdf.dict2md.mkcontent import mk_universal_format
|
||||
from magic_pdf.libs.commons import join_path
|
||||
from magic_pdf.libs.json_compressor import JsonCompressor
|
||||
from magic_pdf.spark.base import exception_handler, get_data_source
|
||||
|
||||
|
||||
def txt_pdf_to_standard_format(jso: dict, debug_mode=False) -> dict:
|
||||
|
||||
if debug_mode:
|
||||
pass
|
||||
else: # 如果debug没开,则检测是否有needdrop字段
|
||||
if jso.get("need_drop", False):
|
||||
book_name = join_path(get_data_source(jso), jso["file_id"])
|
||||
logger.info(f"book_name is:{book_name} need drop")
|
||||
jso["dropped"] = True
|
||||
return jso
|
||||
try:
|
||||
pdf_intermediate_dict = jso["pdf_intermediate_dict"]
|
||||
# 将 pdf_intermediate_dict 解压
|
||||
pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
|
||||
standard_format = mk_universal_format(pdf_intermediate_dict)
|
||||
jso["content_list"] = standard_format
|
||||
logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']},content_list length is {len(standard_format)}",)
|
||||
# 把无用的信息清空
|
||||
jso["doc_layout_result"] = ""
|
||||
jso["pdf_intermediate_dict"] = ""
|
||||
jso["pdf_meta"] = ""
|
||||
except Exception as e:
|
||||
jso = exception_handler(jso, e)
|
||||
return jso
|
||||
21
magic_pdf/spark/base.py
Normal file
21
magic_pdf/spark/base.py
Normal file
@@ -0,0 +1,21 @@
|
||||
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from magic_pdf.libs.drop_reason import DropReason
|
||||
|
||||
|
||||
def get_data_source(jso: dict):
|
||||
data_source = jso.get("data_source")
|
||||
if data_source is None:
|
||||
data_source = jso.get("file_source")
|
||||
return data_source
|
||||
|
||||
|
||||
def exception_handler(jso: dict, e):
|
||||
logger.exception(e)
|
||||
jso["need_drop"] = True
|
||||
jso["drop_reason"] = DropReason.Exception
|
||||
jso["exception"] = f"ERROR: {e}"
|
||||
return jso
|
||||
|
||||
Reference in New Issue
Block a user