mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
Merge remote-tracking branch 'origin/master'
This commit is contained in:
7
magic-pdf.template.json
Normal file
7
magic-pdf.template.json
Normal file
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"bucket_info":{
|
||||
"bucket-name-1":["ak", "sk", "endpoint"],
|
||||
"bucket-name-2":["ak", "sk", "endpoint"]
|
||||
},
|
||||
"temp-output-dir":"/tmp"
|
||||
}
|
||||
48
magic_pdf/cli/magicpdf.py
Normal file
48
magic_pdf/cli/magicpdf.py
Normal file
@@ -0,0 +1,48 @@
|
||||
"""
|
||||
这里实现2个click命令:
|
||||
第一个:
|
||||
接收一个完整的s3路径,例如:s3://llm-pdf-text/pdf_ebook_and_paper/pre-clean-mm-markdown/v014/part-660420b490be-000008.jsonl?bytes=0,81350
|
||||
1)根据~/magic-pdf.json里的ak,sk等,构造s3cliReader读取到这个jsonl的对应行,返回json对象。
|
||||
2)根据Json对象里的pdf的s3路径获取到他的ak,sk,endpoint,构造出s3cliReader用来读取pdf
|
||||
3)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalImageWriter,用来保存截图
|
||||
4)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件
|
||||
|
||||
最后把以上步骤准备好的对象传入真正的解析API
|
||||
|
||||
第二个:
|
||||
接收1)pdf的本地路径。2)模型json文件(可选)。然后:
|
||||
1)根据~/magic-pdf.json读取到本地保存图片、md等临时目录的位置,构造出LocalImageWriter,用来保存截图
|
||||
2)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件
|
||||
3)根据约定,根据pdf本地路径,推导出pdf模型的json,并读入
|
||||
|
||||
|
||||
效果:
|
||||
python magicpdf.py --json s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350
|
||||
python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json 或者 python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf
|
||||
"""
|
||||
|
||||
|
||||
|
||||
|
||||
import click
|
||||
|
||||
# Root click command group; the subcommands in this module register
# themselves on it through ``@cli.command()``.
# NOTE: documented with comments rather than a docstring on purpose -- click
# shows a command's docstring as its ``--help`` text, so a docstring here
# would change the CLI output.
@click.group()
def cli():
    # The group itself has no work to do.
    return None
|
||||
|
||||
# Subcommand that receives an S3 path through ``--json``.
# Placeholder for now: it only echoes the value it was given. ``--json`` is
# not marked as required, so the value is ``None`` when the flag is omitted.
# (Comments instead of a docstring: click would surface a docstring as help text.)
@cli.command()
@click.option('--json', type=str, help='输入一个S3路径')
def json_command(json):
    # The JSON-related processing logic belongs here.
    message = f'处理JSON: {json}'
    print(message)
|
||||
|
||||
# Subcommand that receives a local PDF path (``--pdf``, required, must exist)
# and an optional model path (``--model``, must exist when given).
# Placeholder for now: it only echoes both values.
# (Comments instead of a docstring: click would surface a docstring as help text.)
@cli.command()
@click.option('--pdf', type=click.Path(exists=True), required=True, help='PDF文件的路径')
@click.option('--model', type=click.Path(exists=True), help='模型的路径')
def pdf_command(pdf, model):
    # The PDF / model processing logic belongs here.
    for line in (f'处理PDF: {pdf}', f'加载模型: {model}'):
        print(line)
|
||||
|
||||
# Script entry point: hand control to the click group only when this file is
# executed directly, not when it is imported.
if __name__ == '__main__':
    cli()
|
||||
20
magic_pdf/io/AbsReaderWriter.py
Normal file
20
magic_pdf/io/AbsReaderWriter.py
Normal file
@@ -0,0 +1,20 @@
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
|
||||
class AbsReaderWriter(ABC):
    """Abstract reader/writer meant to support both binary and text I/O.

    TODO: the interface is still a skeleton; only the two abstract entry
    points below are declared.
    """

    @abstractmethod
    def read(self, path: str):
        """Read from ``path``; concrete subclasses must override this."""

    @abstractmethod
    def write(self, path: str, content: str):
        """Write ``content`` to ``path``; concrete subclasses must override this."""
|
||||
|
||||
|
||||
|
||||
|
||||
0
magic_pdf/io/DiskReaderWriter.py
Normal file
0
magic_pdf/io/DiskReaderWriter.py
Normal file
18
magic_pdf/io/S3ReaderWriter.py
Normal file
18
magic_pdf/io/S3ReaderWriter.py
Normal file
@@ -0,0 +1,18 @@
|
||||
|
||||
|
||||
from magic_pdf.io import AbsReaderWriter
|
||||
|
||||
|
||||
class DiskReaderWriter(AbsReaderWriter):
    """Read and write a single local file, always opened in binary mode.

    NOTE(review): ``read``/``write`` here take no ``path`` argument, while the
    abstract ``AbsReaderWriter`` declares ``read(path)`` and
    ``write(path, content)`` -- confirm which signature is intended.
    NOTE(review): ``from magic_pdf.io import AbsReaderWriter`` presumably binds
    the submodule rather than the class while ``magic_pdf/io/__init__.py`` is
    empty -- verify the base class resolves as expected.
    """

    def __init__(self, parent_path, encoding='utf-8'):
        """Remember the target file and the encoding used for text payloads.

        Args:
            parent_path: path of the file that ``read``/``write`` operate on.
            encoding: codec used by ``write`` to turn ``str`` data into bytes.
        """
        self.path = parent_path
        self.encoding = encoding

    def read(self):
        """Return the full content of ``self.path`` as ``bytes``."""
        with open(self.path, 'rb') as f:
            return f.read()

    def write(self, data):
        """Write ``data`` to ``self.path``, replacing any existing content.

        Args:
            data: bytes-like payload, written unchanged, or ``str``, which is
                encoded with ``self.encoding`` first.
        """
        # The file is opened in binary mode, so text must be encoded up front;
        # previously ``encoding`` was stored but unused and a ``str`` payload
        # failed with TypeError.
        if isinstance(data, str):
            data = data.encode(self.encoding)
        with open(self.path, 'wb') as f:
            f.write(data)
|
||||
|
||||
0
magic_pdf/io/__init__.py
Normal file
0
magic_pdf/io/__init__.py
Normal file
@@ -24,7 +24,7 @@ error_log_path = "s3://llm-pdf-text/err_logs/"
|
||||
# json_dump_path = "s3://pdf_books_temp/json_dump/" # 这条路径仅用于临时本地测试,不能提交到main
|
||||
json_dump_path = "s3://llm-pdf-text/json_dump/"
|
||||
|
||||
s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/"
|
||||
s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/" # TODO 基础库不应该有这些存在的路径,应该在业务代码中定义
|
||||
|
||||
|
||||
def get_top_percent_list(num_list, percent):
|
||||
@@ -214,29 +214,6 @@ def get_img_s3_client(save_path:str, image_s3_config:str):
|
||||
|
||||
return img_s3_client
|
||||
|
||||
# def get_s3_object(path):
|
||||
# src_cli_config = Config(**{
|
||||
#
|
||||
# "connect_timeout": 60,
|
||||
# "read_timeout": 20,
|
||||
# "max_pool_connections": 500,
|
||||
# "s3": {
|
||||
# "addressing_style": "path",
|
||||
# },
|
||||
# "retries": {
|
||||
# "max_attempts": 3,
|
||||
# }
|
||||
# })
|
||||
# full_path = f"{bucket_name}/{bucket_prefix}/{path}"
|
||||
# try:
|
||||
# src_cli = boto3.session.Session().client("s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=endpoint, region_name='', config=src_cli_config)
|
||||
# res = src_cli.get_object(Bucket=bucket_name, Key=f"{bucket_prefix}/{path}")
|
||||
# file_content = res["Body"].read()
|
||||
# return file_content
|
||||
# except Exception as e:
|
||||
# logger.error(f"get_s3_object({full_path}) error: {e}")
|
||||
# return b''
|
||||
|
||||
if __name__=="__main__":
|
||||
s3_path = "s3://llm-pdf-text/layout_det/scihub/scimag07865000-07865999/10.1007/s10729-011-9175-6.pdf/"
|
||||
s3_profile = "langchao"
|
||||
|
||||
14
magic_pdf/libs/config_reader.py
Normal file
14
magic_pdf/libs/config_reader.py
Normal file
@@ -0,0 +1,14 @@
|
||||
|
||||
|
||||
"""
|
||||
根据bucket的名字返回对应的s3 AK, SK,endpoint三元组
|
||||
|
||||
"""
|
||||
|
||||
def get_s3_config(bucket_name: str):
    """Return the S3 ``(ak, sk, endpoint)`` triple configured for a bucket.

    The values are read from ``~/magic-pdf.json``. Its ``bucket_info`` object
    maps each bucket name to a three-item list ``[ak, sk, endpoint]`` (the
    layout shown in ``magic-pdf.template.json``).

    Args:
        bucket_name: name of the bucket to look up.

    Returns:
        ``(ak, sk, endpoint)`` for the bucket. Three empty strings are
        returned when the config file does not exist or has no entry for the
        bucket, which is the value the former placeholder always returned.

    Raises:
        ValueError: if the file is not valid JSON, or the bucket entry is not
            a three-item list.
    """
    # Imported locally so this function does not depend on module-level imports.
    import json
    import os

    empty = ("", "", "")
    config_path = os.path.join(os.path.expanduser("~"), "magic-pdf.json")
    try:
        with open(config_path, encoding="utf-8") as f:
            config = json.load(f)
    except FileNotFoundError:
        return empty

    bucket_info = config.get("bucket_info") if isinstance(config, dict) else None
    if not isinstance(bucket_info, dict) or bucket_name not in bucket_info:
        return empty

    entry = bucket_info[bucket_name]
    if not isinstance(entry, (list, tuple)) or len(entry) != 3:
        raise ValueError(
            f"bucket_info[{bucket_name!r}] in {config_path} must be [ak, sk, endpoint]"
        )
    ak, sk, endpoint = entry
    return ak, sk, endpoint
|
||||
@@ -57,12 +57,13 @@ def txt_pdf_to_mm_markdown_format(jso: dict, debug_mode=False) -> dict:
|
||||
pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
|
||||
standard_format = mk_universal_format(pdf_intermediate_dict)
|
||||
mm_content = mk_mm_markdown(standard_format)
|
||||
jso["content_list"] = mm_content
|
||||
jso["content"] = mm_content
|
||||
logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']},content_list length is {len(standard_format)}",)
|
||||
# 把无用的信息清空
|
||||
jso["doc_layout_result"] = ""
|
||||
jso["pdf_intermediate_dict"] = ""
|
||||
jso["pdf_meta"] = ""
|
||||
to_del_keys = ["doc_layout_result", "pdf_intermediate_dict", "pdf_meta", "parsed_result"]
|
||||
for key in to_del_keys:
|
||||
if jso.get(key):
|
||||
del jso[key]
|
||||
except Exception as e:
|
||||
jso = exception_handler(jso, e)
|
||||
return jso
|
||||
@@ -6,7 +6,7 @@ import re
|
||||
import random
|
||||
from typing import List, Union
|
||||
try:
|
||||
from app.config import s3_buckets, s3_clusters, s3_users
|
||||
from app.config import s3_buckets, s3_clusters, s3_users # TODO delete 循环依赖
|
||||
from app.common.runtime import get_cluster_name
|
||||
except ImportError:
|
||||
from magic_pdf.config import s3_buckets, s3_clusters, get_cluster_name, s3_users
|
||||
|
||||
38
magic_pdf/spark/spark_api.py
Normal file
38
magic_pdf/spark/spark_api.py
Normal file
@@ -0,0 +1,38 @@
|
||||
|
||||
"""
|
||||
用户输入:
|
||||
model数组,每个元素代表一个页面
|
||||
pdf在s3的路径
|
||||
截图保存的s3位置
|
||||
|
||||
然后:
|
||||
1)根据s3路径,调用spark集群的api,拿到ak,sk,endpoint,构造出s3PDFReader
|
||||
2)根据用户输入的s3地址,调用spark集群的api,拿到ak,sk,endpoint,构造出s3ImageWriter
|
||||
|
||||
其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
|
||||
|
||||
"""
|
||||
|
||||
|
||||
from magic_pdf.io import AbsReaderWriter
|
||||
|
||||
|
||||
def parse_txt_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, **kwargs):
    """Parse a text-based PDF.

    Not implemented yet: the arguments are accepted and ``None`` is returned.
    Per the module notes, ``pdf_models`` holds one element per page;
    ``imageWriter`` is presumably where screenshots get saved -- confirm once
    the implementation lands.
    """
    return None
|
||||
|
||||
|
||||
def parse_ocr_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, **kwargs):
    """Parse an OCR-type PDF.

    Not implemented yet: the arguments are accepted and ``None`` is returned.
    Per the module notes, ``pdf_models`` holds one element per page;
    ``imageWriter`` is presumably where screenshots get saved -- confirm once
    the implementation lands.
    """
    return None
|
||||
|
||||
|
||||
def parse_union_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, **kwargs):
    """Parse a PDF that mixes OCR and text content, extracting all of it.

    Not implemented yet: the arguments are accepted and ``None`` is returned.
    Per the module notes, ``pdf_models`` holds one element per page;
    ``imageWriter`` is presumably where screenshots get saved -- confirm once
    the implementation lands.
    """
    return None
|
||||
Reference in New Issue
Block a user