mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
46 lines
1.3 KiB
Python
46 lines
1.3 KiB
Python
import os
|
||
import sys
|
||
from pathlib import Path
|
||
|
||
import click
|
||
import json
|
||
from loguru import logger
|
||
|
||
from libs.commons import join_path, parse_aws_param, parse_bucket_key, read_file
|
||
from mkcontent import mk_nlp_markdown
|
||
from pdf2md import main
|
||
from pdf_parse_by_model import parse_pdf_by_model
|
||
|
||
|
||
|
||
|
||
@click.command()
@click.option("--pdf-file-path", help="s3上pdf文件的路径")
@click.option("--pdf-name", help="pdf name")
def main_shell(pdf_file_path: str, pdf_name: str):
    """Run the pdf2md pipeline over every sample of the validation dataset.

    The sample list is loaded from a hard-coded JSON file. Each entry is
    expected to provide an ``s3_path`` and a ``pdf_name`` key. For every
    entry, ``main`` is invoked in debug mode with the sample's PDF path and
    the layout-result directory derived from the sample's name.

    Args:
        pdf_file_path: Value of ``--pdf-file-path``. Accepted for interface
            compatibility but not used: every PDF path comes from the
            dataset file.
        pdf_name: Value of ``--pdf-name``. Accepted for interface
            compatibility but not used: every PDF name comes from the
            dataset file.

    Raises:
        OSError: If the dataset file cannot be opened.
        json.JSONDecodeError: If the dataset file is not valid JSON.
        KeyError: If a sample lacks ``s3_path`` or ``pdf_name``.
    """
    dataset_json_path = '/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset_final_rotated_formulafix_highdpi_scihub.json'

    # The whole dataset is always processed; make it visible when the caller
    # passed options that have no effect instead of dropping them silently.
    if pdf_file_path is not None or pdf_name is not None:
        logger.warning(
            "--pdf-file-path/--pdf-name are ignored; processing every sample in {}",
            dataset_json_path,
        )

    # JSON interchange files are UTF-8; do not depend on the locale default.
    with open(dataset_json_path, 'r', encoding='utf-8') as f:
        samples = json.load(f)

    # The storage profiles are the same for every sample, so set them once.
    pdf_bin_file_profile = "outsider"
    pdf_model_profile = "langchao"

    for sample in samples:
        sample_pdf_path = sample['s3_path']
        sample_pdf_name = sample['pdf_name']
        # Layout/model results are stored per PDF name under a fixed prefix.
        pdf_model_dir = f"s3://llm-pdf-text/eval_1k/layout_res/{sample_pdf_name}"

        main(
            sample_pdf_path,
            pdf_bin_file_profile,
            pdf_model_dir,
            pdf_model_profile,
            debug_mode=True,
        )
|
||
|
||
|
||
# Script entry point: when executed directly (not imported), hand control to
# the click command, which parses the command-line arguments itself.
if __name__ == "__main__":
    main_shell()
|