From e4ed6023e4f480f92eef19dd985b6a23fdd8885f Mon Sep 17 00:00:00 2001 From: xu rui Date: Tue, 3 Dec 2024 14:06:42 +0800 Subject: [PATCH] docs: add [en|zh_cn] docs --- next_docs/en/user_guide/tutorial.rst | 4 +- next_docs/en/user_guide/tutorial/pipeline.rst | 185 ++++++++++++++++++ .../zh_cn/user_guide/tutorial/pipeline.rst | 16 +- 3 files changed, 198 insertions(+), 7 deletions(-) create mode 100644 next_docs/en/user_guide/tutorial/pipeline.rst diff --git a/next_docs/en/user_guide/tutorial.rst b/next_docs/en/user_guide/tutorial.rst index 20764701..eaed04fd 100644 --- a/next_docs/en/user_guide/tutorial.rst +++ b/next_docs/en/user_guide/tutorial.rst @@ -7,4 +7,6 @@ From the beginning to the end, Show how to using mineru via a minimal project .. toctree:: :maxdepth: 1 - tutorial/output_file_description \ No newline at end of file + tutorial/output_file_description + tutorial/pipeline + diff --git a/next_docs/en/user_guide/tutorial/pipeline.rst b/next_docs/en/user_guide/tutorial/pipeline.rst new file mode 100644 index 00000000..8e73a3f5 --- /dev/null +++ b/next_docs/en/user_guide/tutorial/pipeline.rst @@ -0,0 +1,185 @@ + + +Pipeline +========== + + +Minimal Example +^^^^^^^^^^^^^^^^^ + +.. 
code:: python + + import os + + from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader + from magic_pdf.data.dataset import PymuDocDataset + from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze + + # args + pdf_file_name = "abc.pdf" # replace with the real pdf path + name_without_suff = pdf_file_name.split(".")[0] + + # prepare env + local_image_dir, local_md_dir = "output/images", "output" + image_dir = str(os.path.basename(local_image_dir)) + + os.makedirs(local_image_dir, exist_ok=True) + + image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter( + local_md_dir + ) + image_dir = str(os.path.basename(local_image_dir)) + + # read bytes + reader1 = FileBasedDataReader("") + pdf_bytes = reader1.read(pdf_file_name) # read the pdf content + + # proc + ## Create Dataset Instance + ds = PymuDocDataset(pdf_bytes) + + ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir) + +Running the above code will result in the following + + +.. code:: bash + + output/ + ├── abc.md + └── images + + +Excluding the setup of the environment, such as creating directories and importing dependencies, the actual code snippet for converting pdf to markdown is as follows + + +.. code:: python + + # read bytes + reader1 = FileBasedDataReader("") + pdf_bytes = reader1.read(pdf_file_name) # read the pdf content + + # proc + ## Create Dataset Instance + ds = PymuDocDataset(pdf_bytes) + + ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir) + +``ds.apply(doc_analyze, ocr=True)`` generates an ``InferenceResult`` object. The ``InferenceResult`` object, when executing the ``pipe_ocr_mode`` method, produces a ``PipeResult`` object. +The ``PipeResult`` object, upon executing ``dump_md``, generates a ``markdown`` file at the specified location. 
+ + +The pipeline execution process is illustrated in the following diagram + + +.. image:: ../../_static/image/pipeline.drawio.svg + +.. raw:: html + +

+ +Currently, the process is divided into three stages: data, inference, and processing, which correspond to the ``Dataset``, ``InferenceResult``, and ``PipeResult`` entities in the diagram. +These stages are linked together through methods like ``apply``, ``doc_analyze``, or ``pipe_ocr_mode`` + + +.. admonition:: Tip + :class: tip + + For more examples on how to use ``Dataset``, ``InferenceResult``, and ``PipeResult``, please refer to :doc:`../quick_start/to_markdown` + + For more detailed information about ``Dataset``, ``InferenceResult``, and ``PipeResult``, please refer to :doc:`../../api/dataset`, :doc:`../../api/model_operators`, :doc:`../../api/pipe_operators` + + +Pipeline Composition +^^^^^^^^^^^^^^^^^^^^^ + +.. code:: python + + class Dataset(ABC): + @abstractmethod + def apply(self, proc: Callable, *args, **kwargs): + """Apply callable method which. + + Args: + proc (Callable): invoke proc as follows: + proc(self, *args, **kwargs) + + Returns: + Any: return the result generated by proc + """ + pass + + class InferenceResult(InferenceResultBase): + + def apply(self, proc: Callable, *args, **kwargs): + """Apply callable method which. + + Args: + proc (Callable): invoke proc as follows: + proc(inference_result, *args, **kwargs) + + Returns: + Any: return the result generated by proc + """ + return proc(copy.deepcopy(self._infer_res), *args, **kwargs) + + def pipe_ocr_mode( + self, + imageWriter: DataWriter, + start_page_id=0, + end_page_id=None, + debug_mode=False, + lang=None, + ) -> PipeResult: + pass + + class PipeResult: + def apply(self, proc: Callable, *args, **kwargs): + """Apply callable method which. 
+ + Args: + proc (Callable): invoke proc as follows: + proc(pipeline_result, *args, **kwargs) + + Returns: + Any: return the result generated by proc + """ + return proc(copy.deepcopy(self._pipe_res), *args, **kwargs) + + +The ``Dataset``, ``InferenceResult``, and ``PipeResult`` classes all have an ``apply`` method, which can be used to chain different stages of the computation. +As shown below, ``MinerU`` provides a set of methods to compose these classes. + + +.. code:: python + + # proc + ## Create Dataset Instance + ds = PymuDocDataset(pdf_bytes) + + ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir) + + +Users can implement their own functions for chaining as needed. For example, a user could use the ``apply`` method to create a function that counts the number of pages in a ``pdf`` file. + + +.. code:: python + + from magic_pdf.data.data_reader_writer import FileBasedDataReader + from magic_pdf.data.dataset import PymuDocDataset + + # args + pdf_file_name = "abc.pdf" # replace with the real pdf path + + # read bytes + reader1 = FileBasedDataReader("") + pdf_bytes = reader1.read(pdf_file_name) # read the pdf content + + # proc + ## Create Dataset Instance + ds = PymuDocDataset(pdf_bytes) + + def count_page(ds)-> int: + return len(ds) + + print("page number: ", ds.apply(count_page)) # will output the page count of `abc.pdf` diff --git a/next_docs/zh_cn/user_guide/tutorial/pipeline.rst b/next_docs/zh_cn/user_guide/tutorial/pipeline.rst index 9f301368..c8adaa3e 100644 --- a/next_docs/zh_cn/user_guide/tutorial/pipeline.rst +++ b/next_docs/zh_cn/user_guide/tutorial/pipeline.rst @@ -49,7 +49,7 @@ └── images -除去初始化环境,如建立目录、导入依赖库等逻辑。真正执行将 `pdf` 转换为 `markdown` 的代码片段如下 +除去初始化环境,如建立目录、导入依赖库等逻辑。真正将 ``pdf`` 转换为 ``markdown`` 的代码片段如下 .. 
code:: @@ -64,15 +64,19 @@ ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir) -`ds.apply(doc_analyze, ocr=True)` 会生成 `InferenceResult` 对象。 `InferenceResult` 的对象执行 `pipe_ocr_mode` 方法会生成 `PipeResult` 对象。 -`PipeResult` 对象执行 `dump_md` 会在指定位置生成 `markdown` 文件。 +``ds.apply(doc_analyze, ocr=True)`` 会生成 ``InferenceResult`` 对象。 ``InferenceResult`` 对象执行 ``pipe_ocr_mode`` 方法会生成 ``PipeResult`` 对象。 +``PipeResult`` 对象执行 ``dump_md`` 会在指定位置生成 ``markdown`` 文件。 pipeline 的执行过程如下图所示 .. image:: ../../_static/image/pipeline.drawio.svg +.. raw:: html +

+
+目前划分出数据、推理、程序处理三个阶段,分别对应着图上的 ``Dataset``, ``InferenceResult``, ``PipeResult`` 这三个实体。通过 ``apply`` , ``doc_analyze`` 或 ``pipe_ocr_mode`` 等方法链接在一起。
 
 .. admonition:: Tip
@@ -140,8 +144,8 @@ pipeline 的执行过程如下图所示
         """
         return proc(copy.deepcopy(self._pipe_res), *args, **kwargs)
 
-`Dataset` 、 `InferenceResult` 和 `PipeResult` 类均有 `apply` method。可用于组合不同阶段的运算过程。
-如下所示,`MinerU` 提供一套组合这些类的计算过程。
+``Dataset`` 、 ``InferenceResult`` 和 ``PipeResult`` 类均有 ``apply`` method。可用于组合不同阶段的运算过程。
+如下所示,``MinerU`` 提供一套组合这些类的计算过程。
 
 .. code:: python
 
@@ -151,7 +155,7 @@ pipeline 的执行过程如下图所示
 
     ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
 
-用户可以根据的需求,自行实现一些组合用的函数。比如用户通过 `apply` 方法实现一个统计 `pdf` 文件页数的功能。
+用户可以根据自己的需求,自行实现一些组合用的函数。比如用户通过 ``apply`` 方法实现一个统计 ``pdf`` 文件页数的功能。
 
 .. code:: python