mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
Merge branch 'master' of https://github.com/myhloli/Magic-PDF
This commit is contained in:
@@ -2,17 +2,19 @@
|
||||
文本型pdf转化为统一清洗格式
|
||||
"""
|
||||
|
||||
|
||||
# TODO 移动到spark/目录下
|
||||
|
||||
from loguru import logger
|
||||
from magic_pdf.dict2md.mkcontent import mk_universal_format
|
||||
from magic_pdf.dict2md.mkcontent import mk_mm_markdown, mk_universal_format
|
||||
from magic_pdf.libs.commons import join_path
|
||||
from magic_pdf.libs.json_compressor import JsonCompressor
|
||||
from magic_pdf.spark.base import exception_handler, get_data_source
|
||||
|
||||
|
||||
def txt_pdf_to_standard_format(jso: dict, debug_mode=False) -> dict:
|
||||
|
||||
"""
|
||||
变成统一的标准格式
|
||||
"""
|
||||
if debug_mode:
|
||||
pass
|
||||
else: # 如果debug没开,则检测是否有needdrop字段
|
||||
@@ -35,3 +37,32 @@ def txt_pdf_to_standard_format(jso: dict, debug_mode=False) -> dict:
|
||||
except Exception as e:
|
||||
jso = exception_handler(jso, e)
|
||||
return jso
|
||||
|
||||
|
||||
def txt_pdf_to_mm_markdown_format(jso: dict, debug_mode=False) -> dict:
|
||||
"""
|
||||
变成多模态的markdown格式
|
||||
"""
|
||||
if debug_mode:
|
||||
pass
|
||||
else: # 如果debug没开,则检测是否有needdrop字段
|
||||
if jso.get("need_drop", False):
|
||||
book_name = join_path(get_data_source(jso), jso["file_id"])
|
||||
logger.info(f"book_name is:{book_name} need drop")
|
||||
jso["dropped"] = True
|
||||
return jso
|
||||
try:
|
||||
pdf_intermediate_dict = jso["pdf_intermediate_dict"]
|
||||
# 将 pdf_intermediate_dict 解压
|
||||
pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
|
||||
standard_format = mk_universal_format(pdf_intermediate_dict)
|
||||
mm_content = mk_mm_markdown(standard_format)
|
||||
jso["content_list"] = mm_content
|
||||
logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']},content_list length is {len(standard_format)}",)
|
||||
# 把无用的信息清空
|
||||
jso["doc_layout_result"] = ""
|
||||
jso["pdf_intermediate_dict"] = ""
|
||||
jso["pdf_meta"] = ""
|
||||
except Exception as e:
|
||||
jso = exception_handler(jso, e)
|
||||
return jso
|
||||
Reference in New Issue
Block a user