diff --git a/demo/batch_demo.py b/demo/batch_demo.py index 3e181f19..7f3cf468 100644 --- a/demo/batch_demo.py +++ b/demo/batch_demo.py @@ -1,38 +1,23 @@ import os -import shutil -import tempfile from pathlib import Path - -import click -import fitz -from loguru import logger - -import magic_pdf.model as model_config from magic_pdf.data.batch_build_dataset import batch_build_dataset -from magic_pdf.data.data_reader_writer import FileBasedDataReader -from magic_pdf.data.dataset import Dataset -from magic_pdf.libs.version import __version__ -from magic_pdf.tools.common import batch_do_parse, do_parse, parse_pdf_methods -from magic_pdf.utils.office_to_pdf import convert_file_to_pdf +from magic_pdf.tools.common import batch_do_parse def batch(pdf_dir, output_dir, method, lang): - model_config.__use_inside_model__ = True - model_config.__model_mode__ = 'full' os.makedirs(output_dir, exist_ok=True) - doc_paths = [] for doc_path in Path(pdf_dir).glob('*'): if doc_path.suffix == '.pdf': doc_paths.append(doc_path) # build dataset with 2 workers - datasets = batch_build_dataset(doc_paths, 2, lang) + datasets = batch_build_dataset(doc_paths, 4, lang) - os.environ["MINERU_MIN_BATCH_INFERENCE_SIZE"] = "10" # every 10 pages will be parsed in one batch - batch_do_parse(output_dir, [str(doc_path.stem) for doc_path in doc_paths], datasets, method, True) + # os.environ["MINERU_MIN_BATCH_INFERENCE_SIZE"] = "200" # every 200 pages will be parsed in one batch + batch_do_parse(output_dir, [str(doc_path.stem) for doc_path in doc_paths], datasets, method) if __name__ == '__main__': - batch("batch_data", "output", "ocr", "en") + batch("pdfs", "output", "auto", "") diff --git a/demo/demo.py b/demo/demo.py index 2706d7d6..2a1377b6 100644 --- a/demo/demo.py +++ b/demo/demo.py @@ -7,18 +7,17 @@ from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze from magic_pdf.config.enums import SupportedPdfParseMethod # args -pdf_file_name = "demo1.pdf" # replace with the real pdf path -name_without_suff = pdf_file_name.split(".")[0] +__dir__ = os.path.dirname(os.path.abspath(__file__)) +pdf_file_name = os.path.join(__dir__, "pdfs", "demo1.pdf") # replace with the real pdf path +name_without_extension = os.path.basename(pdf_file_name).split('.')[0] # prepare env -local_image_dir, local_md_dir = "output/images", "output" +local_image_dir = os.path.join(__dir__, "output", name_without_extension, "images") +local_md_dir = os.path.join(__dir__, "output", name_without_extension) image_dir = str(os.path.basename(local_image_dir)) - os.makedirs(local_image_dir, exist_ok=True) -image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter( - local_md_dir -) +image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir) # read bytes reader1 = FileBasedDataReader("") @@ -41,32 +40,29 @@ else: ## pipeline pipe_result = infer_result.pipe_txt_mode(image_writer) -### draw model result on each page -infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf")) - ### get model inference result model_inference_result = infer_result.get_infer_res() ### draw layout result on each page -pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf")) +pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_extension}_layout.pdf")) ### draw spans result on each page -pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf")) +pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_extension}_spans.pdf")) ### get markdown content md_content = pipe_result.get_markdown(image_dir) ### dump markdown -pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir) +pipe_result.dump_md(md_writer, f"{name_without_extension}.md", image_dir) ### get content list content content_list_content = pipe_result.get_content_list(image_dir) ### dump content list -pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir) +pipe_result.dump_content_list(md_writer, f"{name_without_extension}_content_list.json", image_dir) ### get middle json middle_json_content = pipe_result.get_middle_json() ### dump middle json -pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json') +pipe_result.dump_middle_json(md_writer, f'{name_without_extension}_middle.json') diff --git a/demo/demo1.pdf b/demo/demo1.pdf deleted file mode 100644 index c9405d62..00000000 Binary files a/demo/demo1.pdf and /dev/null differ diff --git a/demo/demo2.pdf b/demo/demo2.pdf deleted file mode 100644 index af924bf5..00000000 Binary files a/demo/demo2.pdf and /dev/null differ diff --git a/demo/batch_data/demo1.pdf b/demo/pdfs/demo1.pdf similarity index 100% rename from demo/batch_data/demo1.pdf rename to demo/pdfs/demo1.pdf diff --git a/demo/batch_data/demo2.pdf b/demo/pdfs/demo2.pdf similarity index 100% rename from demo/batch_data/demo2.pdf rename to demo/pdfs/demo2.pdf diff --git a/demo/pdfs/demo3.pdf b/demo/pdfs/demo3.pdf new file mode 100644 index 00000000..75f4e047 Binary files /dev/null and b/demo/pdfs/demo3.pdf differ diff --git a/demo/small_ocr.pdf b/demo/pdfs/small_ocr.pdf similarity index 100% rename from demo/small_ocr.pdf rename to demo/pdfs/small_ocr.pdf diff --git a/docs/README_Ubuntu_CUDA_Acceleration_en_US.md b/docs/README_Ubuntu_CUDA_Acceleration_en_US.md index b90ef6f2..bb500dde 100644 --- a/docs/README_Ubuntu_CUDA_Acceleration_en_US.md +++ b/docs/README_Ubuntu_CUDA_Acceleration_en_US.md @@ -92,7 +92,7 @@ You can find the `magic-pdf.json` file in your user directory. Download a sample file from the repository and test it. ```sh -wget https://github.com/opendatalab/MinerU/raw/master/demo/small_ocr.pdf +wget https://github.com/opendatalab/MinerU/raw/master/demo/pdfs/small_ocr.pdf magic-pdf -p small_ocr.pdf -o ./output ``` diff --git a/docs/README_Ubuntu_CUDA_Acceleration_zh_CN.md b/docs/README_Ubuntu_CUDA_Acceleration_zh_CN.md index 9d22d52c..cae6b50a 100644 --- a/docs/README_Ubuntu_CUDA_Acceleration_zh_CN.md +++ b/docs/README_Ubuntu_CUDA_Acceleration_zh_CN.md @@ -91,7 +91,7 @@ pip install -U magic-pdf[full] -i https://mirrors.aliyun.com/pypi/simple 从仓库中下载样本文件,并测试 ```bash -wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/demo/small_ocr.pdf +wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/demo/pdfs/small_ocr.pdf magic-pdf -p small_ocr.pdf -o ./output ``` diff --git a/docs/README_Windows_CUDA_Acceleration_en_US.md b/docs/README_Windows_CUDA_Acceleration_en_US.md index 811f73c0..539140e7 100644 --- a/docs/README_Windows_CUDA_Acceleration_en_US.md +++ b/docs/README_Windows_CUDA_Acceleration_en_US.md @@ -53,7 +53,7 @@ You can find the `magic-pdf.json` file in your 【user directory】 . Download a sample file from the repository and test it. ```powershell - wget https://github.com/opendatalab/MinerU/raw/master/demo/small_ocr.pdf -O small_ocr.pdf + wget https://github.com/opendatalab/MinerU/raw/master/demo/pdfs/small_ocr.pdf -O small_ocr.pdf magic-pdf -p small_ocr.pdf -o ./output ``` diff --git a/docs/README_Windows_CUDA_Acceleration_zh_CN.md b/docs/README_Windows_CUDA_Acceleration_zh_CN.md index 73d6df70..a772e798 100644 --- a/docs/README_Windows_CUDA_Acceleration_zh_CN.md +++ b/docs/README_Windows_CUDA_Acceleration_zh_CN.md @@ -54,7 +54,7 @@ pip install -U magic-pdf[full] -i https://mirrors.aliyun.com/pypi/simple 从仓库中下载样本文件,并测试 ```powershell - wget https://github.com/opendatalab/MinerU/raw/master/demo/small_ocr.pdf -O small_ocr.pdf + wget https://github.com/opendatalab/MinerU/raw/master/demo/pdfs/small_ocr.pdf -O small_ocr.pdf magic-pdf -p small_ocr.pdf -o ./output ``` diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/__init__.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/__init__.py index 6cfaf436..9eef2969 100755 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/__init__.py +++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/__init__.py @@ -3,21 +3,6 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals -import os -import sys -import numpy as np -# import paddle -import signal -import random - -__dir__ = os.path.dirname(os.path.abspath(__file__)) -sys.path.append(os.path.abspath(os.path.join(__dir__, '../..'))) - - -import copy -# from paddle.io import Dataset, DataLoader, BatchSampler, DistributedBatchSampler -# import paddle.distributed as dist - from .imaug import transform, create_operators diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_system.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_system.py index 871c8fd9..e35b9a4b 100755 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_system.py +++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_system.py @@ -16,7 +16,6 @@ class TextSystem(object): if self.use_angle_cls: self.text_classifier = predict_cls.TextClassifier(args, **kwargs) - def get_rotate_crop_image(self, img, points): ''' img_height, img_width = img.shape[0:2] diff --git a/magic_pdf/tools/cli.py b/magic_pdf/tools/cli.py index 94def789..27c249df 100644 --- a/magic_pdf/tools/cli.py +++ b/magic_pdf/tools/cli.py @@ -90,8 +90,6 @@ without method specified, auto will be used by default.""", default=None, ) def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id): - model_config.__use_inside_model__ = True - model_config.__model_mode__ = 'full' os.makedirs(output_dir, exist_ok=True) temp_dir = tempfile.mkdtemp() def read_fn(path: Path): diff --git a/magic_pdf/tools/common.py b/magic_pdf/tools/common.py index 8b70a21f..134c5df8 100644 --- a/magic_pdf/tools/common.py +++ b/magic_pdf/tools/common.py @@ -73,7 +73,7 @@ def _do_parse( pdf_bytes_or_dataset, model_list, parse_method, - debug_able, + debug_able=False, f_draw_span_bbox=True, f_draw_layout_bbox=True, f_dump_md=True, @@ -250,7 +250,7 @@ def do_parse( pdf_bytes_or_dataset, model_list, parse_method, - debug_able, + debug_able=False, f_draw_span_bbox=True, f_draw_layout_bbox=True, f_dump_md=True, @@ -291,7 +291,7 @@ def batch_do_parse( pdf_file_names: list[str], pdf_bytes_or_datasets: list[bytes | Dataset], parse_method, - debug_able, + debug_able=False, f_draw_span_bbox=True, f_draw_layout_bbox=True, f_dump_md=True,