Files
MinerU/s3pdf2md.py
2024-02-29 16:53:41 +08:00

46 lines
1.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import sys
from pathlib import Path
import click
import json
from loguru import logger
from libs.commons import join_path, parse_aws_param, parse_bucket_key, read_file
from mkcontent import mk_nlp_markdown
from pdf2md import main
from pdf_parse_by_model import parse_pdf_by_model
@click.command()
@click.option("--pdf-file-path", help="s3上pdf文件的路径")
@click.option("--pdf-name", help="pdf name")
def main_shell(pdf_file_path: str, pdf_name: str):
with open('/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset_final_rotated_formulafix_highdpi_scihub.json', 'r') as f:
samples = json.load(f)
for sample in samples:
pdf_file_path = sample['s3_path']
pdf_bin_file_profile = "outsider"
pdf_name = sample['pdf_name']
pdf_model_dir = f"s3://llm-pdf-text/eval_1k/layout_res/{pdf_name}"
pdf_model_profile = "langchao"
p = Path(pdf_file_path)
pdf_file_name = p.name # pdf文件名字含后缀
#pdf_model_dir = join_path(pdf_model_parent_dir, pdf_file_name)
main(
pdf_file_path,
pdf_bin_file_profile,
pdf_model_dir,
pdf_model_profile,
debug_mode=True,
)
if __name__ == "__main__":
main_shell()