Fix the file-name bug when writing the PDF to local disk

Update the output files written by the CLI
update requirements
2026-03-27 11:08:32 +07:00 · 2024-06-18 15:44:45 +08:00 · 2024-06-18 15:39:27 +08:00 · 2024-06-18 14:51:06 +08:00 · 2024-06-18 14:45:23 +08:00 · 2024-06-18 14:15:06 +08:00
24 changed files with 561 additions and 181 deletions
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -11,8 +11,51 @@ on:


 jobs:
-  build:

+  update-version:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          ref: master
+          fetch-depth: 0
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+
+      - name: Update version.py
+        run: |
+          python update_version.py
+
+      - name: Verify version.py
+        run: |
+          ls -l magic_pdf/libs/version.py
+          cat magic_pdf/libs/version.py
+
+      - name: Commit changes
+        run: |
+          git config --local user.email "moe@myhloli.com"
+          git config --local user.name "myhloli"
+          git add magic_pdf/libs/version.py
+          if git diff-index --quiet HEAD; then
+            echo "No changes to commit"
+          else
+            git commit -m "Update version.py with new version"
+          fi
+        id: commit_changes
+
+      - name: Push changes
+        if: steps.commit_changes.outcome == 'success'
+        env:
+          GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }}
+        run: |
+          git push origin HEAD:master
+
+  build:
+    needs: [ update-version ]
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
@@ -23,8 +66,14 @@ jobs:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
+        ref: master
        fetch-depth: 0

+    - name: Verify version.py
+      run: |
+        ls -l magic_pdf/libs/version.py
+        cat magic_pdf/libs/version.py
+
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v5
      with:
@@ -70,8 +119,8 @@ jobs:
          files: './dist/*.whl'
        env:
          GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }}
-      # - name: Publish to PyPI
-      #   uses: pypa/gh-action-pypi-publish@release/v1
-      #   with:
-      #     user: __token__
-      #     password: ${{ secrets.PYPI_TOKEN }}
+
+      - name: Publish distribution to PyPI
+        run: |
+          pip install twine
+          twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
--- a/magic_pdf/cli/magicpdf.py
+++ b/magic_pdf/cli/magicpdf.py
@@ -27,6 +27,7 @@ import sys
 import click
 from loguru import logger
 from pathlib import Path
+from magic_pdf.libs.version import __version__

 from magic_pdf.libs.MakeContentConfig import DropMode
 from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
@@ -43,6 +44,7 @@ from magic_pdf.libs.config_reader import get_local_dir
 from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
 from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
+import csv

 parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])

@@ -52,13 +54,22 @@ def prepare_env(pdf_file_name, method):
        get_local_dir(), "magic-pdf", pdf_file_name, method
    )

-    local_image_dir = os.path.join(local_parent_dir, "images")
+    local_image_dir = os.path.join(str(local_parent_dir), "images")
    local_md_dir = local_parent_dir
    os.makedirs(local_image_dir, exist_ok=True)
    os.makedirs(local_md_dir, exist_ok=True)
    return local_image_dir, local_md_dir


+def write_to_csv(csv_file_path, csv_data):
+    with open(csv_file_path, mode='a', newline='', encoding='utf-8') as csvfile:
+        # 创建csv writer对象
+        csv_writer = csv.writer(csvfile)
+        # 写入数据
+        csv_writer.writerow(csv_data)
+    print(f"数据已成功追加到 '{csv_file_path}'")
+
+
 def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer, md_writer, image_dir, local_md_dir):
    if parse_method == "auto":
        jso_useful_key = {
@@ -75,28 +86,54 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
        sys.exit(1)

    pipe.pipe_classify()
+
+    '''如果没有传入有效的模型数据，则使用内置paddle解析'''
+    if len(model_list) == 0:
+        pipe.pipe_analyze()
+
    pipe.pipe_parse()
    pdf_info = pipe.pdf_mid_data['pdf_info']
    draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
    draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)

+    # write_to_csv(r"D:\project\20231108code-clean\linshixuqiu\pdf_dev\新模型\新建文件夹\luanma.csv",
+    #              [pdf_file_name, pipe.pdf_mid_data['not_common_character_rate'], pipe.pdf_mid_data['not_printable_rate']])
+
    md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE)
+    '''写markdown'''
    md_writer.write(
        content=md_content, path=f"{pdf_file_name}.md", mode=AbsReaderWriter.MODE_TXT
    )
+    '''写middle_json'''
    md_writer.write(
        content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
-        path=f"{pdf_file_name}.json",
+        path=f"{pdf_file_name}_middle.json",
        mode=AbsReaderWriter.MODE_TXT,
    )
-
-    content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
+    '''写model_json'''
    md_writer.write(
-        str(content_list), f"{pdf_file_name}.txt", AbsReaderWriter.MODE_TXT
+        content=json_parse.dumps(pipe.model_list, ensure_ascii=False, indent=4),
+        path=f"{pdf_file_name}_model.json",
+        mode=AbsReaderWriter.MODE_TXT,
+    )
+    '''写源pdf'''
+    md_writer.write(
+        content=pdf_bytes,
+        path=f"{pdf_file_name}_origin.pdf",
+        mode=AbsReaderWriter.MODE_BIN,
+    )
+    content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
+    '''写content_list'''
+    md_writer.write(
+        content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
+        path=f"{pdf_file_name}_content_list.json",
+        mode=AbsReaderWriter.MODE_TXT
    )


@click.group()
+@click.version_option(__version__, "--version", "-v", help="显示版本信息")
+@click.help_option("--help", "-h", help="显示帮助信息")
 def cli():
    pass

@@ -141,7 +178,7 @@ def json_command(json, method):
    pdf_file_name = Path(s3_file_path).stem
    pdf_data = read_s3_path(s3_file_path)
    local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
-    
+
    local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
        local_md_dir
    )
@@ -158,60 +195,60 @@ def json_command(json, method):
    )


-    @cli.command()
-    @click.option("--local_json", type=str, help="输入一个本地jsonl路径")
-    @click.option(
-        "--method",
-        type=parse_pdf_methods,
-        help="指定解析方法。txt: 文本型 pdf 解析方法， ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
-        default="auto",
-    )
-    def local_json_command(local_json, method):
-        def read_s3_path(s3path):
-            bucket, key = parse_s3path(s3path)
+@cli.command()
+@click.option("--local_json", type=str, help="输入一个本地jsonl路径")
+@click.option(
+    "--method",
+    type=parse_pdf_methods,
+    help="指定解析方法。txt: 文本型 pdf 解析方法， ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
+    default="auto",
+)
+def local_json_command(local_json, method):
+    def read_s3_path(s3path):
+        bucket, key = parse_s3path(s3path)

-            s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
-            s3_rw = S3ReaderWriter(
-                s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
-            )
-            may_range_params = parse_s3_range_params(s3path)
-            if may_range_params is None or 2 != len(may_range_params):
-                byte_start, byte_end = 0, None
-            else:
-                byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
-                byte_end += byte_start - 1
-            return s3_rw.read_jsonl(
-                remove_non_official_s3_args(s3path),
-                byte_start,
-                byte_end,
-                AbsReaderWriter.MODE_BIN,
+        s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
+        s3_rw = S3ReaderWriter(
+            s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
+        )
+        may_range_params = parse_s3_range_params(s3path)
+        if may_range_params is None or 2 != len(may_range_params):
+            byte_start, byte_end = 0, None
+        else:
+            byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
+            byte_end += byte_start - 1
+        return s3_rw.read_jsonl(
+            remove_non_official_s3_args(s3path),
+            byte_start,
+            byte_end,
+            AbsReaderWriter.MODE_BIN,
+        )
+
+    with open(local_json, "r", encoding="utf-8") as f:
+        for json_line in f:
+            jso = json_parse.loads(json_line)
+
+            s3_file_path = jso.get("file_location")
+            if s3_file_path is None:
+                s3_file_path = jso.get("path")
+            pdf_file_name = Path(s3_file_path).stem
+            pdf_data = read_s3_path(s3_file_path)
+            local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
+
+            local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
+                local_md_dir
            )

-        with open(local_json, "r", encoding="utf-8") as f:
-            for json_line in f:
-                jso = json_parse.loads(json_line)
-
-                s3_file_path = jso.get("file_location")
-                if s3_file_path is None:
-                    s3_file_path = jso.get("path")
-                pdf_file_name = Path(s3_file_path).stem
-                pdf_data = read_s3_path(s3_file_path)
-                local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
-
-                local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
-                    local_md_dir
-                )
-
-                _do_parse(
-                    pdf_file_name,
-                    pdf_data,
-                    jso["doc_layout_result"],
-                    method,
-                    local_image_rw,
-                    local_md_rw,
-                    os.path.basename(local_image_dir),
-                    local_md_dir
-                )
+            _do_parse(
+                pdf_file_name,
+                pdf_data,
+                jso["doc_layout_result"],
+                method,
+                local_image_rw,
+                local_md_rw,
+                os.path.basename(local_image_dir),
+                local_md_dir
+            )


@cli.command()
@@ -226,19 +263,28 @@ def json_command(json, method):
    default="auto",
 )
 def pdf_command(pdf, model, method):
-    # 这里处理pdf和模型相关的逻辑
-    if model is None:
-        model = pdf.replace(".pdf", ".json")
-        if not os.path.exists(model):
-            print(f"make sure json {model} existed and place under {os.path.dirname(pdf)}", file=sys.stderr)
-            exit(1)
-
    def read_fn(path):
        disk_rw = DiskReaderWriter(os.path.dirname(path))
        return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)

    pdf_data = read_fn(pdf)
-    jso = json_parse.loads(read_fn(model).decode("utf-8"))
+
+    def get_model_json(model_path):
+        # 这里处理pdf和模型相关的逻辑
+        if model_path is None:
+            model_path = pdf.replace(".pdf", ".json")
+            if not os.path.exists(model_path):
+                logger.warning(f"not found json {model_path} existed, use paddle analyze")
+                # 本地无模型数据则调用内置paddle分析，先传空list，在内部识别到空list再调用paddle
+                model_json = "[]"
+            else:
+                model_json = read_fn(model_path).decode("utf-8")
+        else:
+            model_json = read_fn(model_path).decode("utf-8")
+
+        return model_json
+
+    jso = json_parse.loads(get_model_json(model))
    pdf_file_name = Path(pdf).stem
    local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
    local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
--- a/magic_pdf/filter/pdf_classify_by_type.py
+++ b/magic_pdf/filter/pdf_classify_by_type.py
@@ -21,7 +21,7 @@ from magic_pdf.libs.commons import mymax, get_top_percent_list
 from magic_pdf.filter.pdf_meta_scan import scan_max_page, junk_limit_min

 TEXT_LEN_THRESHOLD = 100
-AVG_TEXT_LEN_THRESHOLD = 200
+AVG_TEXT_LEN_THRESHOLD = 100
 TEXT_LEN_SAMPLE_RATIO = 0.1  # 抽取0.1的页面进行文字长度统计


@@ -65,12 +65,14 @@ def merge_images(image_list, page_width, page_height, max_offset=5, max_gap=2):
            # 如果宽达标，检测是否能竖着拼
            if full_width:
                # 竖着拼需要满足两个前提，左右边界各偏移不能超过 max_offset，第一张图的下边界和第二张图的上边界偏移不能超过 max_gap
-                close1 = (last_x0 - max_offset) <= x0 <= (last_x0 + max_offset) and (last_x1 - max_offset) <= x1 <= (last_x1 + max_offset) and (last_y1 - max_gap) <= y0 <= (last_y1 + max_gap)
+                close1 = (last_x0 - max_offset) <= x0 <= (last_x0 + max_offset) and (last_x1 - max_offset) <= x1 <= (
+                            last_x1 + max_offset) and (last_y1 - max_gap) <= y0 <= (last_y1 + max_gap)

            # 如果高达标，检测是否可以横着拼
            if full_height:
                # 横着拼需要满足两个前提，上下边界各偏移不能超过 max_offset，第一张图的右边界和第二张图的左边界偏移不能超过 max_gap
-                close2 = (last_y0 - max_offset) <= y0 <= (last_y0 + max_offset) and (last_y1 - max_offset) <= y1 <= (last_y1 + max_offset) and (last_x1 - max_gap) <= x0 <= (last_x1 + max_gap)
+                close2 = (last_y0 - max_offset) <= y0 <= (last_y0 + max_offset) and (last_y1 - max_offset) <= y1 <= (
+                            last_y1 + max_offset) and (last_x1 - max_gap) <= x0 <= (last_x1 + max_gap)

            # Check if the image can be merged with the last image
            if (full_width and close1) or (full_height and close2):
@@ -109,10 +111,9 @@ def classify_by_area(total_page: int, page_width, page_height, img_sz_list, text
    # 先对每个id出现的次数做个统计
    objid_cnt = Counter([objid for page_img_sz in img_sz_list for _, _, _, _, objid in page_img_sz])
    # 再去掉出现次数大于10的
-    if total_page >= scan_max_page:# 新的meta_scan只扫描前 scan_max_page 页，页数大于 scan_max_page 当total_page为 scan_max_page
+    if total_page >= scan_max_page:  # 新的meta_scan只扫描前 scan_max_page 页，页数大于 scan_max_page 当total_page为 scan_max_page
        total_page = scan_max_page

-
    repeat_threshold = 2  # 把bad_image的阈值设为2
    # repeat_threshold = min(2, total_page)  # 当total_page为1时，repeat_threshold为1，会产生误判导致所有img变成bad_img
    bad_image_objid = set([objid for objid, cnt in objid_cnt.items() if cnt >= repeat_threshold])
@@ -129,26 +130,26 @@ def classify_by_area(total_page: int, page_width, page_height, img_sz_list, text
    # if len(fake_image_ids) > 0 and any([l > TEXT_LEN_THRESHOLD for l in text_len_at_bad_image_page_idx]):  # 这些透明图片所在的页面上有文字大于阈值
    #     return True

-    img_sz_list = [[img_sz for img_sz in page_img_sz if img_sz[-1] not in bad_image_objid] for page_img_sz in img_sz_list]  # 过滤掉重复出现的图片
-
+    img_sz_list = [[img_sz for img_sz in page_img_sz if img_sz[-1] not in bad_image_objid] for page_img_sz in
+                   img_sz_list]  # 过滤掉重复出现的图片

    # 有的扫描版会把一页图片拆成很多张，需要先把图拼起来再计算
    img_sz_list = merge_images(img_sz_list, page_width, page_height)

    # 计算每个页面上最大的图的面积，然后计算这个面积占页面面积的比例
-    max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in img_sz_list]
+    max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in
+                               img_sz_list]
    page_area = page_width * page_height
    max_image_area_per_page = [area / page_area for area in max_image_area_per_page]
    max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.5]

-    if len(max_image_area_per_page) >= 0.5 * total_page:   # 阈值从0.8改到0.5，适配3页里面有两页和两页里面有一页的情况
+    if len(max_image_area_per_page) >= 0.5 * total_page:  # 阈值从0.8改到0.5，适配3页里面有两页和两页里面有一页的情况
        # 这里条件成立的前提是把反复出现的图片去掉了。这些图片是隐藏的透明图层，其特点是id都一样
        return False
    else:
        return True


-
 def classify_by_text_len(text_len_list: list, total_page: int):
    """
    随机抽取10%的页面，如果少于5个页面，那么就取全部页面。
@@ -173,6 +174,7 @@ def classify_by_text_len(text_len_list: list, total_page: int):
    is_text_pdf = any([text_len > TEXT_LEN_THRESHOLD for text_len in text_len_lst])
    return is_text_pdf

+
 def classify_by_avg_words(text_len_list: list):
    """
    补充规则，如果平均每页字数少于 AVG_TEXT_LEN_THRESHOLD，就不是文字pdf
@@ -193,6 +195,7 @@ def classify_by_avg_words(text_len_list: list):

    return is_text_pdf

+
 def classify_by_img_num(img_sz_list: list, img_num_list: list):
    """
    补充规则，有一种扫描版本的PDF，每一页都会放所有的扫描页进去，在 metascan 时会被去重，
@@ -208,11 +211,11 @@ def classify_by_img_num(img_sz_list: list, img_num_list: list):
    # img_sz_list中非空元素的个数小于1，前80%的元素都相等，且最大值大于等于junk_limit_min
    if count_img_sz_list_not_none <= 1 and len(set(top_eighty_percent)) == 1 and max(img_num_list) >= junk_limit_min:

-    #拿max和min的值,用来判断list内的值是否全都相等
-    # min_imgs = min(img_num_list)
-    # max_imgs = max(img_num_list)
-    #
-    # if count_img_sz_list_not_none == 0 and max_imgs == min_imgs and max_imgs >= junk_limit_min:
+        #拿max和min的值,用来判断list内的值是否全都相等
+        # min_imgs = min(img_num_list)
+        # max_imgs = max(img_num_list)
+        #
+        # if count_img_sz_list_not_none == 0 and max_imgs == min_imgs and max_imgs >= junk_limit_min:
        return False  # 如果满足这个条件，一定不是文字版pdf
    else:
        return True  # 不满足这三个条件，可能是文字版pdf，通过其他规则判断
@@ -244,6 +247,7 @@ def classify_by_text_layout(text_layout_per_page: list):
    else:
        return False  # 文本布局未知，默认认为不是文字版pdf

+
 def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
    """
    判断一页是否由细长条组成，有两个条件：
@@ -258,6 +262,7 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
    Returns:
        bool: 如果满足条件的页面的比例小于0.5，返回True，否则返回False
    """
+
    def is_narrow_strip(img):
        x0, y0, x1, y1, _ = img
        width, height = x1 - x0, y1 - y0
@@ -299,7 +304,8 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
    return narrow_strip_pages_ratio < 0.5


-def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list, text_layout_list: list):
+def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list,
+             text_layout_list: list):
    """
    这里的图片和页面长度单位是pts
    :param total_page:
@@ -324,7 +330,9 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
    elif not any(results.values()):
        return False, results
    else:
-        logger.warning(f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']}, by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}, by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']}", file=sys.stderr)  # 利用这种情况可以快速找出来哪些pdf比较特殊，针对性修正分类算法
+        logger.warning(
+            f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']}, by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}, by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']}",
+            file=sys.stderr)  # 利用这种情况可以快速找出来哪些pdf比较特殊，针对性修正分类算法
        return False, results


--- a/magic_pdf/libs/commons.py
+++ b/magic_pdf/libs/commons.py
@@ -1,6 +1,7 @@
 import datetime
 import json
 import os, re, configparser
+import subprocess
 import time

 import boto3
@@ -11,6 +12,7 @@ from botocore.config import Config
 import fitz # 1.23.9中已经切换到rebase
 # import fitz_old as fitz  # 使用1.23.9之前的pymupdf库

+
 def get_delta_time(input_time):
    return round(time.time() - input_time, 2)

--- a/magic_pdf/libs/language.py
+++ b/magic_pdf/libs/language.py
@@ -1,7 +1,6 @@
-import pycld2 as cld2
 import regex
 import unicodedata
-
+from fast_langdetect import detect_langs

 RE_BAD_CHARS = regex.compile(r"\p{Cc}|\p{Cs}")

@@ -13,17 +12,13 @@ def remove_bad_chars(text):
 def detect_lang(text: str) -> str:
    if len(text) == 0:
        return ""
-
    try:
-        _, _, details = cld2.detect(text)
+        lang_upper = detect_langs(text)
    except:
-        # cld2 doesn't like control characters
-        # https://github.com/mikemccand/chromium-compact-language-detector/issues/22#issuecomment-435904616
-        html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C',]])
-        _, _, details = cld2.detect(html_no_ctrl_chars)
-    lang = ""
+        html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
+        lang_upper = detect_langs(html_no_ctrl_chars)
    try:
-        lang = details[0][1].lower()
+        lang = lang_upper.lower()
    except:
        lang = ""
    return lang
@@ -33,4 +28,4 @@ if __name__ == '__main__':
    print(detect_lang("This is a test."))
    print(detect_lang("<html>This is a test</html>"))
    print(detect_lang("这个是中文测试。"))
-    print(detect_lang("<html>这个是中文测试。</html>"))
+    print(detect_lang("<html>这个是中文测试。</html>"))
--- a/magic_pdf/libs/version.py
+++ b/magic_pdf/libs/version.py
@@ -0,0 +1 @@
+__version__ = "0.5.4"
--- a/magic_pdf/model/360_layout_analysis.py
+++ b/magic_pdf/model/360_layout_analysis.py
@@ -0,0 +1,8 @@
+from ultralytics import YOLO
+
+image_path = ''  # 待预测图片路径
+model_path = ''  # 权重路径
+model = YOLO(model_path)
+
+result = model(image_path, save=True, conf=0.5, save_crop=False, line_width=2)
+print(result)
--- a/magic_pdf/model/doc_analyze_by_custom_model.py
+++ b/magic_pdf/model/doc_analyze_by_custom_model.py
@@ -0,0 +1,61 @@
+import fitz
+import cv2
+from PIL import Image
+import numpy as np
+
+from magic_pdf.model.model_list import MODEL
+from magic_pdf.model.pp_structure_v2 import CustomPaddleModel
+
+
+def dict_compare(d1, d2):
+    return d1.items() == d2.items()
+
+
+def remove_duplicates_dicts(lst):
+    unique_dicts = []
+    for dict_item in lst:
+        if not any(
+                dict_compare(dict_item, existing_dict) for existing_dict in unique_dicts
+        ):
+            unique_dicts.append(dict_item)
+    return unique_dicts
+
+
+def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
+    images = []
+    with fitz.open("pdf", pdf_bytes) as doc:
+        for index in range(0, doc.page_count):
+            page = doc[index]
+            mat = fitz.Matrix(dpi / 72, dpi / 72)
+            pm = page.get_pixmap(matrix=mat, alpha=False)
+
+            # if width or height > 2000 pixels, don't enlarge the image
+            # if pm.width > 2000 or pm.height > 2000:
+            #     pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
+
+            img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
+            img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
+            img_dict = {"img": img, "width": pm.width, "height": pm.height}
+            images.append(img_dict)
+    return images
+
+
+def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False, model=MODEL.Paddle):
+    images = load_images_from_pdf(pdf_bytes)
+    custom_model = None
+    if model == MODEL.Paddle:
+        custom_model = CustomPaddleModel(ocr=ocr, show_log=show_log)
+    else:
+        pass
+    model_json = []
+    for index, img_dict in enumerate(images):
+        img = img_dict["img"]
+        page_width = img_dict["width"]
+        page_height = img_dict["height"]
+        result = custom_model(img)
+        page_info = {"page_no": index, "height": page_height, "width": page_width}
+        page_dict = {"layout_dets": result, "page_info": page_info}
+
+        model_json.append(page_dict)
+
+    return model_json
--- a/magic_pdf/model/magic_model.py
+++ b/magic_pdf/model/magic_model.py
@@ -37,7 +37,14 @@ class MagicModel:
            )
            layout_dets = model_page_info["layout_dets"]
            for layout_det in layout_dets:
-                x0, y0, _, _, x1, y1, _, _ = layout_det["poly"]
+
+                if layout_det.get("bbox") is not None:
+                    # 兼容直接输出bbox的模型数据,如paddle
+                    x0, y0, x1, y1 = layout_det["bbox"]
+                else:
+                    # 兼容直接输出poly的模型数据，如xxx
+                    x0, y0, _, _, x1, y1, _, _ = layout_det["poly"]
+
                bbox = [
                    int(x0 / horizontal_scale_ratio),
                    int(y0 / vertical_scale_ratio),
--- a/magic_pdf/model/model_list.py
+++ b/magic_pdf/model/model_list.py
@@ -0,0 +1,2 @@
+class MODEL:
+    Paddle = "pp_structure_v2"
--- a/magic_pdf/model/pp_structure_v2.py
+++ b/magic_pdf/model/pp_structure_v2.py
@@ -0,0 +1,75 @@
+import random
+
+from loguru import logger
+from paddleocr import PPStructure
+
+
+def region_to_bbox(region):
+    x0 = region[0][0]
+    y0 = region[0][1]
+    x1 = region[2][0]
+    y1 = region[2][1]
+    return [x0, y0, x1, y1]
+
+
+class CustomPaddleModel:
+    def __init__(self, ocr: bool = False, show_log: bool = False):
+        self.model = PPStructure(table=False, ocr=ocr, show_log=show_log)
+
+    def __call__(self, img):
+        result = self.model(img)
+        spans = []
+        for line in result:
+            line.pop("img")
+            """
+            为paddle输出适配type no.    
+            title: 0 # 标题
+            text: 1 # 文本
+            header: 2 # abandon
+            footer: 2 # abandon
+            reference: 1 # 文本 or abandon
+            equation: 8 # 行间公式 block
+            equation: 14 # 行间公式 text
+            figure: 3 # 图片
+            figure_caption: 4 # 图片描述
+            table: 5 # 表格
+            table_caption: 6 # 表格描述
+            """
+            if line["type"] == "title":
+                line["category_id"] = 0
+            elif line["type"] in ["text", "reference"]:
+                line["category_id"] = 1
+            elif line["type"] == "figure":
+                line["category_id"] = 3
+            elif line["type"] == "figure_caption":
+                line["category_id"] = 4
+            elif line["type"] == "table":
+                line["category_id"] = 5
+            elif line["type"] == "table_caption":
+                line["category_id"] = 6
+            elif line["type"] == "equation":
+                line["category_id"] = 8
+            elif line["type"] in ["header", "footer"]:
+                line["category_id"] = 2
+            else:
+                logger.warning(f"unknown type: {line['type']}")
+
+            # 兼容不输出score的paddleocr版本
+            if line.get("score") is None:
+                line["score"] = 0.5 + random.random() * 0.5
+
+            res = line.pop("res", None)
+            if res is not None and len(res) > 0:
+                for span in res:
+                    new_span = {
+                        "category_id": 15,
+                        "bbox": region_to_bbox(span["text_region"]),
+                        "score": span["confidence"],
+                        "text": span["text"],
+                    }
+                    spans.append(new_span)
+
+        if len(spans) > 0:
+            result.extend(spans)
+
+        return result
--- a/magic_pdf/para/para_split_v2.py
+++ b/magic_pdf/para/para_split_v2.py
@@ -1,7 +1,7 @@
 from sklearn.cluster import DBSCAN
 import numpy as np
 from loguru import logger
-
+import re
 from magic_pdf.libs.boxbase import _is_in_or_part_overlap_with_area_ratio as is_in_layout
 from magic_pdf.libs.ocr_content_type import ContentType, BlockType
 from magic_pdf.model.magic_model import MagicModel
@@ -106,16 +106,19 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
        3. 如果非顶格，首字符大写，编码为2
        4. 如果非顶格，首字符非大写编码为3
        """
+        if len(lines) > 0:
+            x_map_tag_dict, min_x_tag = cluster_line_x(lines)
        for l in lines:
-            first_char = __get_span_text(l['spans'][0])[0]
+            span_text = __get_span_text(l['spans'][0])
+            first_char = span_text[0]
            layout = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)
            if not layout:
                line_fea_encode.append(0)
            else:
-                layout_left = layout[0]
-                if l['bbox'][0] == layout_left:
+                #
+                if x_map_tag_dict[round(l['bbox'][0])] == min_x_tag:
                    # if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
-                    if not first_char.isalnum():
+                    if not first_char.isalnum() or if_match_reference_list(span_text):
                        line_fea_encode.append(1)
                    else:
                        line_fea_encode.append(4)
@@ -144,6 +147,36 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):

        return split_indices(total_lines, list_indice), list_start_idx

+def cluster_line_x(lines: list) -> dict:
+    """
+    对一个block内所有lines的bbox的x0聚类
+    """
+    min_distance = 5
+    min_sample = 1
+    x0_lst = np.array([[round(line['bbox'][0]), 0] for line in lines])
+    x0_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x0_lst)
+    x0_uniq_label = np.unique(x0_clusters.labels_)
+    #x1_lst = np.array([[line['bbox'][2], 0] for line in lines])
+    x0_2_new_val = {}  # 存储旧值对应的新值映射
+    min_x0 = round(lines[0]["bbox"][0])
+    for label in x0_uniq_label:
+        if label == -1:
+            continue
+        x0_index_of_label = np.where(x0_clusters.labels_ == label)
+        x0_raw_val = x0_lst[x0_index_of_label][:, 0]
+        x0_new_val = np.min(x0_lst[x0_index_of_label][:, 0])
+        x0_2_new_val.update({round(raw_val): round(x0_new_val) for raw_val in x0_raw_val})
+        if x0_new_val < min_x0:
+            min_x0 = x0_new_val
+    return x0_2_new_val, min_x0
+
+def if_match_reference_list(text: str) -> bool:
+    pattern = re.compile(r'^\d+\..*')
+    if pattern.match(text):
+        return True
+    else:
+        return False
+

 def __valign_lines(blocks, layout_bboxes):
    """
@@ -298,7 +331,7 @@ def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en"):
                 block["type"] == BlockType.Text for line in
                 block['lines']]
        total_lines = len(lines)
-        if total_lines == 1:
+        if total_lines == 1 or total_lines == 0:
            list_info.append([False, False])
            continue
        """在进入到真正的分段之前，要对文字块从统计维度进行对齐方式的探测，
@@ -315,10 +348,11 @@ def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en"):
        """
        for list_start in list_start_line:
            if len(list_start) > 1:
-                for i in range(1, len(list_start)):
+                for i in range(0, len(list_start)):
                    index = list_start[i] - 1
-                    if "content" in lines[index]["spans"][-1]:
-                        lines[index]["spans"][-1]["content"] += '\n\n'
+                    if index >= 0:
+                        if "content" in lines[index]["spans"][-1]:
+                            lines[index]["spans"][-1]["content"] += '\n\n'
        layout_list_info = [False, False]  # 这个layout最后是不是列表,记录每一个layout里是不是列表开头，列表结尾
        for content_type, start, end in text_segments:
            if content_type == 'list':
@@ -388,20 +422,17 @@ def __connect_list_inter_layout(blocks_group, new_layout_bbox, layout_list_info,
            logger.info(f"连接page {page_num} 内的list")
            # 向layout_paras[i] 寻找开头具有相同缩进的连续的行
            may_list_lines = []
-            for j in range(len(next_paras)):
-                lines = next_paras[j].get("lines", [])
-                if len(lines) == 1:  # 只可能是一行，多行情况再需要分析了
-                    if lines[0]['bbox'][0] > __find_layout_bbox_by_line(lines[0]['bbox'], new_layout_bbox)[0]:
-                        may_list_lines.append(lines[0])
-                    else:
-                        break
+            lines = next_first_para.get("lines", [])
+
+            for line in lines:
+                if line['bbox'][0] > __find_layout_bbox_by_line(line['bbox'], new_layout_bbox)[0]:
+                    may_list_lines.append(line)
                else:
                    break
            # 如果这些行的缩进是相等的，那么连到上一个layout的最后一个段落上。
            if len(may_list_lines) > 0 and len(set([x['bbox'][0] for x in may_list_lines])) == 1:
                pre_last_para.extend(may_list_lines)
-                blocks_group[i] = blocks_group[i][len(may_list_lines):]
-                # layout_paras[i] = layout_paras[i][len(may_list_lines):]
+                next_first_para["lines"] = next_first_para["lines"][len(may_list_lines):]

    return blocks_group, [layout_list_info[0][0], layout_list_info[-1][1]]  # 同时还返回了这个页面级别的开头、结尾是不是列表的信息

@@ -422,18 +453,14 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
        logger.info(f"连接page {page_num} 内的list")
        # 向layout_paras[i] 寻找开头具有相同缩进的连续的行
        may_list_lines = []
-        for j in range(len(next_page_paras[0])):
-            next_page_block_j = next_page_paras[0][j]
-            if next_page_block_j["type"] != BlockType.Text:
-                break
-            lines = next_page_block_j["lines"]
-            if len(lines) == 1:  # 只可能是一行，多行情况再需要分析了
-                if lines[0]['bbox'][0] > __find_layout_bbox_by_line(lines[0]['bbox'], next_page_layout_bbox)[0]:
-                    may_list_lines.append(lines[0])
+        next_page_first_para = next_page_paras[0][0]
+        if next_page_first_para["type"] == BlockType.Text:
+            lines = next_page_first_para["lines"]
+            for line in lines:
+                if line['bbox'][0] > __find_layout_bbox_by_line(line['bbox'], next_page_layout_bbox)[0]:
+                    may_list_lines.append(line)
                else:
                    break
-            else:
-                break
        # 如果这些行的缩进是相等的，那么连到上一个layout的最后一个段落上。
        if len(may_list_lines) > 0 and len(set([x['bbox'][0] for x in may_list_lines])) == 1:
            #pre_page_paras[-1].append(may_list_lines)
@@ -442,7 +469,7 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
                for span in line["spans"]:
                    span[CROSS_PAGE] = True
            pre_page_paras[-1][-1]["lines"].extend(may_list_lines)
-            next_page_paras[0] = next_page_paras[0][len(may_list_lines):]
+            next_page_first_para["lines"] = next_page_first_para["lines"][len(may_list_lines):]
            return True

    return False
@@ -471,7 +498,6 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
    if len(blocks_group) == 0:
        return connected_layout_blocks

-    #connected_layout_paras.append(layout_paras[0])
    connected_layout_blocks.append(blocks_group[0])
    for i in range(1, len(blocks_group)):
        try:
@@ -484,6 +510,9 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
            if blocks_group[i - 1][-1]["type"] != BlockType.Text or blocks_group[i][0]["type"] != BlockType.Text:
                connected_layout_blocks.append(blocks_group[i])
                continue
+            if len(blocks_group[i - 1][-1]["lines"]) == 0 or len(blocks_group[i][0]["lines"]) == 0:
+                connected_layout_blocks.append(blocks_group[i])
+                continue
            pre_last_line = blocks_group[i - 1][-1]["lines"][-1]
            next_first_line = blocks_group[i][0]["lines"][0]
        except Exception as e:
@@ -505,7 +534,7 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):

        pre_last_line_text = pre_last_line_text.strip()
        next_first_line_text = next_first_line_text.strip()
-        if pre_last_line['bbox'][2] == pre_x2_max and pre_last_line_text[-1] not in LINE_STOP_FLAG and \
+        if pre_last_line['bbox'][2] == pre_x2_max and pre_last_line_text and pre_last_line_text[-1] not in LINE_STOP_FLAG and \
                next_first_line['bbox'][0] == next_x0_min:  # 前面一行沾满了整个行，并且没有结尾符号.下一行没有空白开头。
            """连接段落条件成立，将前一个layout的段落和后一个layout的段落连接。"""
            #connected_layout_paras[-1][-1].extend(layout_paras[i][0])
@@ -557,8 +586,15 @@ def __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
        # 不是文本，不连接
        return False

-    pre_x2_max = __find_layout_bbox_by_line(pre_last_line['bbox'], pre_page_layout_bbox)[2]
-    next_x0_min = __find_layout_bbox_by_line(next_first_line['bbox'], next_page_layout_bbox)[0]
+    pre_x2_max_bbox = __find_layout_bbox_by_line(pre_last_line['bbox'], pre_page_layout_bbox)
+    if not pre_x2_max_bbox:
+        return False
+    next_x0_min_bbox = __find_layout_bbox_by_line(next_first_line['bbox'], next_page_layout_bbox)
+    if not next_x0_min_bbox:
+        return False
+
+    pre_x2_max = pre_x2_max_bbox[2]
+    next_x0_min = next_x0_min_bbox[0]

    pre_last_line_text = pre_last_line_text.strip()
    next_first_line_text = next_first_line_text.strip()
--- a/magic_pdf/pdf_parse_union_core.py
+++ b/magic_pdf/pdf_parse_union_core.py
@@ -111,6 +111,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
    spans = ocr_cut_image_and_table(spans, pdf_docs[page_id], page_id, pdf_bytes_md5, imageWriter)

    '''将所有区块的bbox整理到一起'''
+    # @todo interline_equation_blocks参数不够准，后面切换到interline_equations上
    if len(interline_equation_blocks) > 0:
        all_bboxes, all_discarded_blocks, drop_reasons = ocr_prepare_bboxes_for_layout_split(
            img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
--- a/magic_pdf/pipe/AbsPipe.py
+++ b/magic_pdf/pipe/AbsPipe.py
@@ -33,6 +33,13 @@ class AbsPipe(ABC):
        """
        raise NotImplementedError

+    @abstractmethod
+    def pipe_analyze(self):
+        """
+        Run the model analysis step (stateful).
+        """
+        raise NotImplementedError
+
    @abstractmethod
    def pipe_parse(self):
        """
--- a/magic_pdf/pipe/OCRPipe.py
+++ b/magic_pdf/pipe/OCRPipe.py
@@ -1,6 +1,6 @@
 from magic_pdf.libs.MakeContentConfig import DropMode
+from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
-from magic_pdf.libs.json_compressor import JsonCompressor
 from magic_pdf.pipe.AbsPipe import AbsPipe
 from magic_pdf.user_api import parse_ocr_pdf

@@ -13,6 +13,9 @@ class OCRPipe(AbsPipe):
    def pipe_classify(self):
        pass

+    def pipe_analyze(self):  # Fill self.model_list by running doc_analyze on the PDF bytes in OCR mode.
+        self.model_list = doc_analyze(self.pdf_bytes, ocr=True)  # replaces any model_list already held by the instance
+
    def pipe_parse(self):
        self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)

--- a/magic_pdf/pipe/TXTPipe.py
+++ b/magic_pdf/pipe/TXTPipe.py
@@ -1,4 +1,5 @@
 from magic_pdf.libs.MakeContentConfig import DropMode
+from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
 from magic_pdf.libs.json_compressor import JsonCompressor
 from magic_pdf.pipe.AbsPipe import AbsPipe
@@ -13,6 +14,9 @@ class TXTPipe(AbsPipe):
    def pipe_classify(self):
        pass

+    def pipe_analyze(self):  # Fill self.model_list by running doc_analyze on the PDF bytes with OCR disabled.
+        self.model_list = doc_analyze(self.pdf_bytes, ocr=False)  # replaces any model_list already held by the instance
+
    def pipe_parse(self):
        self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)

--- a/magic_pdf/pipe/UNIPipe.py
+++ b/magic_pdf/pipe/UNIPipe.py
@@ -3,6 +3,7 @@ import json
 from loguru import logger

 from magic_pdf.libs.MakeContentConfig import DropMode
+from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
 from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
 from magic_pdf.libs.commons import join_path
@@ -15,14 +16,24 @@ class UNIPipe(AbsPipe):
    def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False):
        self.pdf_type = jso_useful_key["_pdf_type"]
        super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug)
+        if len(self.model_list) == 0:
+            self.input_model_is_empty = True
+        else:
+            self.input_model_is_empty = False

    def pipe_classify(self):
        self.pdf_type = AbsPipe.classify(self.pdf_bytes)

+    def pipe_analyze(self):  # Fill self.model_list via doc_analyze, choosing the OCR flag from self.pdf_type.
+        if self.pdf_type == self.PIP_TXT:
+            self.model_list = doc_analyze(self.pdf_bytes, ocr=False)  # text PDF: analyze without OCR
+        elif self.pdf_type == self.PIP_OCR:
+            self.model_list = doc_analyze(self.pdf_bytes, ocr=True)  # OCR PDF: analyze with OCR; any other pdf_type leaves model_list unchanged
+
    def pipe_parse(self):
        if self.pdf_type == self.PIP_TXT:
            self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer,
-                                                is_debug=self.is_debug)
+                                                is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty)
        elif self.pdf_type == self.PIP_OCR:
            self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
                                              is_debug=self.is_debug)
--- a/magic_pdf/pre_proc/equations_replace.py
+++ b/magic_pdf/pre_proc/equations_replace.py
@@ -107,6 +107,7 @@ def _is_in_or_part_overlap(box1, box2) -> bool:
        or y0_1 > y1_2
    )  # box1在box2的下边

+
 def remove_text_block_overlap_interline_equation_bbox(
    interline_eq_bboxes, pymu_block_list
 ):
@@ -122,10 +123,10 @@ def remove_text_block_overlap_interline_equation_bbox(
                deleted_chars = []
                for char in span["chars"]:
                    if any(
-                        [
-                            _is_in_or_part_overlap(char["bbox"], eq_bbox["bbox"])
-                            for eq_bbox in interline_eq_bboxes
-                        ]
+                            [
+                                (calculate_overlap_area_2_minbox_area_ratio(eq_bbox["bbox"], char["bbox"]) > 0.5)
+                                for eq_bbox in interline_eq_bboxes
+                            ]
                    ):
                        deleted_chars.append(char)
                # 检查span里没有char则删除这个span
--- a/magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+++ b/magic_pdf/pre_proc/ocr_detect_all_bboxes.py
@@ -36,6 +36,9 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
    all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
    '''任何框体与舍弃框重叠，优先信任舍弃框'''
    all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
+    # @todo interline_equation 与title或text框冲突的情况，分两种情况处理
+    '''interline_equation框与文本类型框iou比较接近1的时候，信任行间公式框'''
+    '''interline_equation框被包含在文本类型框内，且interline_equation比文本区块小很多时信任文本框，这时需要舍弃公式框'''

    '''discarded_blocks中只保留宽度超过1/3页面宽度的，高度超过10的，处于页面下半50%区域的（限定footnote）'''
    for discarded in discarded_blocks:
--- a/magic_pdf/pre_proc/ocr_dict_merge.py
+++ b/magic_pdf/pre_proc/ocr_dict_merge.py
@@ -160,12 +160,12 @@ def fill_spans_in_blocks(blocks, spans, radio):
                block_spans.append(span)

        '''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
-        displayed_list = []
-        text_inline_lines = []
-        modify_y_axis(block_spans, displayed_list, text_inline_lines)
+        # displayed_list = []
+        # text_inline_lines = []
+        # modify_y_axis(block_spans, displayed_list, text_inline_lines)

        '''模型识别错误的行间公式, type类型转换成行内公式'''
-        block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)
+        # block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)

        '''bbox去除粘连'''  # 去粘连会影响span的bbox，导致后续fill的时候出错
        # block_spans = remove_overlap_between_bbox_for_span(block_spans)
@@ -196,8 +196,10 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks):
            block = fix_image_block(block, img_blocks)
        elif block_type == BlockType.Table:
            block = fix_table_block(block, table_blocks)
-        elif block_type in [BlockType.Text, BlockType.Title, BlockType.InterlineEquation]:
+        elif block_type in [BlockType.Text, BlockType.Title]:
            block = fix_text_block(block)
+        elif block_type == BlockType.InterlineEquation:
+            block = fix_interline_block(block)
        else:
            continue
        fix_blocks.append(block)
@@ -315,6 +317,18 @@ def fix_table_block(block, table_blocks):


 def fix_text_block(block):
+    # Every equation span inside a text block should be converted to the inline type
+    for span in block['spans']:
+        if span['type'] == ContentType.InterlineEquation:
+            span['type'] = ContentType.InlineEquation  # mutates the span dict in place
+    block_lines = merge_spans_to_line(block['spans'])  # group the block's spans into lines
+    sort_block_lines = line_sort_spans_by_left_to_right(block_lines)  # presumably orders spans within each line left-to-right — confirm against helper
+    block['lines'] = sort_block_lines
+    del block['spans']  # the block now carries 'lines' instead of 'spans'
+    return block
+
+
+def fix_interline_block(block):
    block_lines = merge_spans_to_line(block['spans'])
    sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
    block['lines'] = sort_block_lines
--- a/magic_pdf/user_api.py
+++ b/magic_pdf/user_api.py
@@ -1,4 +1,3 @@
-
 """
 用户输入：
    model数组，每个元素代表一个页面
@@ -16,14 +15,16 @@ import re

 from loguru import logger

+from magic_pdf.libs.version import __version__
+from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.rw import AbsReaderWriter
 from magic_pdf.pdf_parse_by_ocr_v2 import parse_pdf_by_ocr
 from magic_pdf.pdf_parse_by_txt_v2 import parse_pdf_by_txt

-
 PARSE_TYPE_TXT = "txt"
 PARSE_TYPE_OCR = "ocr"

+
 def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args,
                  **kwargs):
    """
@@ -39,6 +40,8 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit

    pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT

+    pdf_info_dict["_version_name"] = __version__
+
    return pdf_info_dict


@@ -57,10 +60,13 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit

    pdf_info_dict["_parse_type"] = PARSE_TYPE_OCR

+    pdf_info_dict["_version_name"] = __version__
+
    return pdf_info_dict


 def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0,
+                    input_model_is_empty: bool = False,
                    *args, **kwargs):
    """
    ocr和文本混合的pdf，全部解析出来
@@ -88,7 +94,7 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
                    for span in line['spans']:
                        text_all += span['content']

-    def calculate_garbled_rate(text):
+    def calculate_not_common_character_rate(text):
        garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]')
        # 计算乱码字符的数量
        garbage_count = len(garbage_regex.findall(text))
@@ -97,10 +103,30 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
            return 0  # 避免除以零的错误
        return garbage_count / total

-    garbled_rate = calculate_garbled_rate(text_all)
+    def calculate_not_printable_rate(text):  # Return the fraction of characters in `text` for which str.isprintable() is False.
+        printable_text = ""
+        for c in text:
+            if c.isprintable():
+                printable_text += c  # keep only printable characters
+        printable_total = len(printable_text)
+        total = len(text)
+        if total == 0:
+            return 0  # avoid a division-by-zero error on empty text
+        return (total - printable_total) / total  # non-printable count divided by total length

-    if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False) or garbled_rate > 0.8:
+    not_common_character_rate = calculate_not_common_character_rate(text_all)
+    not_printable_rate = calculate_not_printable_rate(text_all)
+    pdf_info_dict["_not_common_character_rate"] = not_common_character_rate
+    pdf_info_dict["_not_printable_rate"] = not_printable_rate
+    logger.info(f"not_common_character_rate: {not_common_character_rate}, not_printable_rate: {not_printable_rate}")
+    # not_common_character_rate对小语种可能会有误伤，not_printable_rate对小语种较为友好
+    if (pdf_info_dict is None
+        or pdf_info_dict.get("_need_drop", False)
+        or not_printable_rate > 0.02  # 参考一些正常的pdf，这个值没有超过0.01的，阈值设为0.02
+    ):
        logger.warning(f"parse_pdf_by_txt drop or error or garbled_rate too large, switch to parse_pdf_by_ocr")
+        if input_model_is_empty:
+            pdf_models = doc_analyze(pdf_bytes, ocr=True)
        pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
        if pdf_info_dict is None:
            raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
@@ -109,4 +135,6 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
    else:
        pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT

+    pdf_info_dict["_version_name"] = __version__
+
    return pdf_info_dict
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,16 +2,16 @@ boto3>=1.28.43
 Brotli>=1.1.0
 click>=8.1.7
 Distance>=0.1.3
-PyMuPDF>=1.24.3
+PyMuPDF>=1.24.5
 loguru>=0.6.0
 matplotlib>=3.8.3
 numpy>=1.21.6
 pandas>=1.3.5
-pycld2>=0.41
+fast-langdetect>=0.1.1
 regex>=2023.12.25
 termcolor>=2.4.0
 wordninja>=2.0.0
 scikit-learn>=1.0.2
 nltk==3.8.1
 s3pathlib>=2.1.1
-pytest
+paddleocr @ https://github.com/magicpdf/PaddleOCR/releases/download/paddleocr-2.8.2-released/paddleocr-2.8.2-py3-none-any.whl
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,7 @@
 from setuptools import setup, find_packages
-import subprocess
+from magic_pdf.libs.version import __version__
+
+
 def parse_requirements(filename):
    with open(filename) as f:
        lines = f.read().splitlines()
@@ -8,37 +10,26 @@ def parse_requirements(filename):

    for line in lines:
        if "http" in line:
-            pkg_name_with_version = line.split('/')[-1].split('-')[0]
-            requires.append(pkg_name_with_version)
+            pkg_name_without_url = line.split('@')[0].strip()
+            requires.append(pkg_name_without_url)
        else:
            requires.append(line)

    return requires

-def get_version():
-    command = ["git", "describe", "--tags"]
-    try:
-        version = subprocess.check_output(command).decode().strip()
-        version_parts = version.split("-")
-        if len(version_parts) > 1 and version_parts[0].startswith("magic_pdf"):
-            return version_parts[1]
-        else:
-            raise ValueError(f"Invalid version tag {version}. Expected format is magic_pdf-<version>-released.")
-    except Exception as e:
-        print(e)
-        return "0.0.0"

-
-requires = parse_requirements('requirements.txt')
-
-setup(
-    name="magic_pdf",  # 项目名
-    # version="0.1.3",  # 版本号
-    version=get_version(),  # 自动从tag中获取版本号
-    packages=find_packages(),  # 包含所有的包
-    install_requires=requires,  # 项目依赖的第三方库
-    python_requires=">=3.9",  # 项目依赖的 Python 版本
-    # entry_points={"console_scripts": ["my_command=my_project.main:run"]}, # 项目提供的可执行命令
-    include_package_data=True,  # 是否包含非代码文件，如数据文件、配置文件等
-    zip_safe=False,  # 是否使用 zip 文件格式打包，一般设为 False
-)
+if __name__ == '__main__':
+    setup(
+        name="magic_pdf",  # project name
+        version=__version__,  # version string imported from magic_pdf.libs.version
+        packages=find_packages(),  # include every package that is found
+        install_requires=parse_requirements('requirements.txt'),  # third-party libraries the project depends on
+        extras_require={
+            "gpu": ["paddlepaddle-gpu"],
+            "cpu": ["paddlepaddle"],
+        },
+        python_requires=">=3.9",  # Python version required by the project
+        # entry_points={"console_scripts": ["my_command=my_project.main:run"]}, # console commands provided by the project
+        include_package_data=True,  # whether to include non-code files such as data and configuration files
+        zip_safe=False,  # whether to package as a zip file; usually set to False
+    )
--- a/update_version.py
+++ b/update_version.py
@@ -0,0 +1,27 @@
+import os
+import subprocess
+
+
+def get_version():  # Derive the version from `git describe --tags`; returns the fallback "0.0.0" on any failure.
+    command = ["git", "describe", "--tags"]
+    try:
+        version = subprocess.check_output(command).decode().strip()
+        version_parts = version.split("-")  # expected tag shape: magic_pdf-<version>-released
+        if len(version_parts) > 1 and version_parts[0].startswith("magic_pdf"):
+            return version_parts[1]  # the piece between the first and second '-'
+        else:
+            raise ValueError(f"Invalid version tag {version}. Expected format is magic_pdf-<version>-released.")
+    except Exception as e:  # also catches the ValueError raised above, so a malformed tag yields the fallback too
+        print(e)
+        return "0.0.0"
+
+
+def write_version_to_commons(version):  # Overwrite magic_pdf/libs/version.py (relative to this script's directory) with one __version__ assignment.
+    commons_path = os.path.join(os.path.dirname(__file__), 'magic_pdf', 'libs', 'version.py')
+    with open(commons_path, 'w') as f:  # mode 'w' truncates any existing content
+        f.write(f'__version__ = "{version}"\n')
+
+
+if __name__ == '__main__':  # script entry point: compute the version, then write it into version.py
+    version_name = get_version()
+    write_version_to_commons(version_name)
Author	SHA1	Message	Date
赵小蒙	5f313bd0b4	fix local write pdf file name bug	2024-06-18 15:44:45 +08:00
赵小蒙	3b7342b894	update cli output files	2024-06-18 15:39:27 +08:00
赵小蒙	9dc5033cf7	update requirements	2024-06-18 14:51:06 +08:00
赵小蒙	389826c5fe	update custom model framework	2024-06-18 14:45:23 +08:00
myhloli	c96aa88d13	Merge pull request #119 from icecraft/feat/parallel_paddle feat: parallelize paddle	2024-06-18 14:15:06 +08:00
blue	738f9274a9	feat: parallelize paddle	2024-06-18 13:57:40 +08:00
赵小蒙	084dc22ab1	update AVG_TEXT_LEN_THRESHOLD 200->100	2024-06-18 10:46:54 +08:00
赵小蒙	6c52856d2a	remove useless import	2024-06-17 19:16:19 +08:00
赵小蒙	c69f414b20	update pypi upload logic	2024-06-17 12:38:40 +08:00
赵小蒙	0306d66d25	update pypi upload logic	2024-06-17 12:32:47 +08:00
赵小蒙	35d39735da	update pypi upload logic	2024-06-17 12:21:03 +08:00
myhloli	e57a9d87c7	Update version.py with new version	2024-06-17 04:11:27 +00:00
赵小蒙	ce0d99057a	use fast_langdetect replace cld2	2024-06-17 12:09:14 +08:00
赵小蒙	0606301412	make paddle analyze mode adaptation cli input mode to improve analyze speed	2024-06-17 11:37:48 +08:00
赵小蒙	39b46ea980	update github workflow to Publish to PyPI	2024-06-14 18:16:41 +08:00
赵小蒙	aeef64b482	update github workflow	2024-06-14 17:57:20 +08:00
赵小蒙	d2e8271322	Merge remote-tracking branch 'origin/master'	2024-06-14 11:02:46 +08:00
赵小蒙	d62dd24939	update paddleocr url	2024-06-14 11:02:35 +08:00
myhloli	0c33f2f0d3	Update version.py with new version	2024-06-13 12:28:12 +00:00
赵小蒙	64c628434d	Merge remote-tracking branch 'origin/master'	2024-06-13 20:26:54 +08:00
赵小蒙	a5ff8acea7	update paddleocr to 2.8+ and add layout score output	2024-06-13 20:26:31 +08:00
myhloli	0b97f26552	Merge pull request #118 from papayalove/master 修复分段边界问题	2024-06-13 18:40:01 +08:00
liukaiwen	2284e0d77b	修复分段边界问题	2024-06-13 18:37:59 +08:00
myhloli	f80560ffea	Update version.py with new version	2024-06-12 12:22:04 +00:00
赵小蒙	5aa2e01264	add paddlepaddle in requirements.txt	2024-06-12 20:20:41 +08:00
赵小蒙	384c979d68	update: use paddleocr analyze layout in no model_json input	2024-06-12 20:00:44 +08:00
myhloli	2ad0134ca2	Update version.py with new version	2024-06-11 10:42:51 +00:00
myhloli	0678a8603d	Merge pull request #117 from papayalove/master 修复分段边界问题	2024-06-11 17:21:23 +08:00
liukaiwen	9c6cb7b772	修复分段边界问题	2024-06-11 17:19:56 +08:00
赵小蒙	bf18172d8a	update: model parse support paddle output	2024-06-11 10:58:44 +08:00
赵小蒙	e92de75844	add todo about interline_equation	2024-06-07 11:11:34 +08:00
myhloli	b7a418b538	Update version.py with new version	2024-06-06 08:02:48 +00:00
赵小蒙	3c145ba0ca	fix: some text char removed by interline_equations overlap	2024-06-06 16:00:26 +08:00
myhloli	999b698fca	Update version.py with new version	2024-06-05 07:22:24 +00:00
赵小蒙	9b5b116369	fix: change garbled_rate 0.1 -> 0.02	2024-06-05 15:21:14 +08:00
myhloli	c50fa4dc72	Update version.py with new version	2024-06-04 11:11:30 +00:00
赵小蒙	54f31b65cb	update cli	2024-06-04 19:10:29 +08:00
myhloli	4ce15c44f3	Update version.py with new version	2024-06-04 10:10:12 +00:00
赵小蒙	88f2245d86	update cli	2024-06-04 18:03:12 +08:00
myhloli	bc05526602	Update version.py with new version	2024-06-04 09:21:33 +00:00
赵小蒙	b18e9365fa	Merge remote-tracking branch 'origin/master' # Conflicts: # magic_pdf/libs/version.py	2024-06-04 17:21:01 +08:00
赵小蒙	48b6992b71	update workflow	2024-06-04 17:20:11 +08:00
myhloli	4f6171d19e	Update version.py with new version	2024-06-04 09:15:57 +00:00
赵小蒙	595517054b	update workflow	2024-06-04 17:14:07 +08:00
赵小蒙	705c4dcf30	update workflow	2024-06-04 17:10:59 +08:00
赵小蒙	ff52be3304	update workflow	2024-06-04 17:08:01 +08:00
赵小蒙	a68f4174cd	update workflow	2024-06-04 16:58:19 +08:00
赵小蒙	2d0d5a8208	update workflow	2024-06-04 16:38:47 +08:00
赵小蒙	887a3d989b	update workflow	2024-06-04 16:37:49 +08:00
赵小蒙	6ab1a65a6a	fix error	2024-06-04 16:26:37 +08:00
赵小蒙	48d3032318	fix error	2024-06-04 16:03:06 +08:00
赵小蒙	ddde1b82f2	fix error	2024-06-04 15:49:46 +08:00
赵小蒙	c7a685b302	fix error	2024-06-04 15:12:56 +08:00
赵小蒙	93a59ff4a3	fix error	2024-06-04 15:04:27 +08:00
赵小蒙	ab8413811f	fix error	2024-06-04 12:16:41 +08:00
赵小蒙	e73964fc12	fix error	2024-06-04 12:13:30 +08:00
赵小蒙	b74f17e439	fix error	2024-06-04 12:08:37 +08:00
赵小蒙	20278040a5	fix error	2024-06-04 11:59:43 +08:00
赵小蒙	9d0b4e95de	fix error: version is 0.0.0	2024-06-04 11:51:39 +08:00
赵小蒙	7fd8d97edb	fix error: version is 0.0.0	2024-06-04 11:48:24 +08:00
赵小蒙	1877055672	fix error	2024-06-04 11:44:49 +08:00
赵小蒙	75d0fa3d24	fix error	2024-06-04 11:38:15 +08:00
赵小蒙	07f6c49707	chanage update version logic	2024-06-04 11:33:57 +08:00
赵小蒙	1de37e4c65	add version_name to middle json	2024-06-04 11:15:52 +08:00
赵小蒙	bd1834284e	add version_name to middle json	2024-06-03 18:51:38 +08:00
赵小蒙	496045f361	update annotation	2024-05-31 10:27:23 +08:00
赵小蒙	75478eda89	update setup	2024-05-30 10:26:10 +08:00
赵小蒙	3f3edc39f5	update setup	2024-05-30 10:25:02 +08:00
赵小蒙	97a4e47319	change garbled rate check from not_common_character_rate to not_printable_rate	2024-05-28 18:22:01 +08:00
myhloli	5de372245c	Merge pull request #116 from papayalove/master 修复分段边界问题	2024-05-28 10:26:19 +08:00
Kaiwen Liu	135adac43d	Merge branch 'magicpdf:master' into master	2024-05-28 10:21:11 +08:00
liukaiwen	ba52e33527	修复分段边界问题	2024-05-28 10:20:47 +08:00
myhloli	78ed786794	Merge pull request #115 from papayalove/master 修复边界问题（修复list拼接和reference分行问题）	2024-05-27 15:19:15 +08:00
liukaiwen	4ff09a2fbc	修复边界问题（修复list拼接和reference分行问题）	2024-05-27 15:16:00 +08:00
赵小蒙	f8548a8ea2	update PyMuPDF to 1.24.4	2024-05-27 14:40:17 +08:00
myhloli	10a95bcd05	Merge pull request #114 from papayalove/master 修复list拼接和reference分行问题	2024-05-24 16:34:07 +08:00
liukaiwen	dbdbaf58be	Merge branch 'master' of github.com:papayalove/Magic-PDF	2024-05-24 16:31:11 +08:00
liukaiwen	afe92f07d6	修复list拼接和reference分行问题	2024-05-24 16:31:00 +08:00