change update version logic

add version_name to middle json
2026-03-27 11:08:32 +07:00 · 2024-06-04 11:33:57 +08:00 · 2024-06-04 11:15:52 +08:00 · 2024-06-03 18:51:38 +08:00 · 2024-05-31 10:27:23 +08:00 · 2024-05-30 10:26:10 +08:00
8 changed files with 152 additions and 64 deletions
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -11,6 +11,35 @@ on:


 jobs:
+
+  update-version:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # full history and tags; `git describe --tags` finds no tag in the default shallow clone
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+
+      - name: Update version.py
+        run: python update_version.py
+
+      - name: Commit changes
+        run: |
+          git config --local user.email "moe@myhloli.com"
+          git config --local user.name "myhloli"
+          git add magic_pdf/libs/version.py  # the path update_version.py actually writes
+          # Commit only when the staged file changed; a plain `git commit` exits 1 on a clean index.
+          git diff --cached --quiet || git commit -m "Update version.py with new version"
+
+      - name: Push changes
+        env:
+          GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }}
+        run: git push
  build:

    runs-on: ubuntu-latest
--- a/magic_pdf/libs/commons.py
+++ b/magic_pdf/libs/commons.py
@@ -1,6 +1,7 @@
 import datetime
 import json
 import os, re, configparser
+import subprocess
 import time

 import boto3
@@ -11,6 +12,7 @@ from botocore.config import Config
 import fitz # 1.23.9中已经切换到rebase
 # import fitz_old as fitz  # 使用1.23.9之前的pymupdf库

+
 def get_delta_time(input_time):
    return round(time.time() - input_time, 2)

--- a/magic_pdf/libs/version.py
+++ b/magic_pdf/libs/version.py
@@ -0,0 +1 @@
+__version__ = "0.0.0"
--- a/magic_pdf/para/para_split_v2.py
+++ b/magic_pdf/para/para_split_v2.py
@@ -1,7 +1,7 @@
 from sklearn.cluster import DBSCAN
 import numpy as np
 from loguru import logger
-
+import re
 from magic_pdf.libs.boxbase import _is_in_or_part_overlap_with_area_ratio as is_in_layout
 from magic_pdf.libs.ocr_content_type import ContentType, BlockType
 from magic_pdf.model.magic_model import MagicModel
@@ -106,16 +106,19 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
        3. 如果非顶格，首字符大写，编码为2
        4. 如果非顶格，首字符非大写编码为3
        """
+        if len(lines) > 0:
+            x_map_tag_dict, min_x_tag = cluster_line_x(lines)
        for l in lines:
-            first_char = __get_span_text(l['spans'][0])[0]
+            span_text = __get_span_text(l['spans'][0])
+            first_char = span_text[0]
            layout = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)
            if not layout:
                line_fea_encode.append(0)
            else:
-                layout_left = layout[0]
-                if l['bbox'][0] == layout_left:
+                #
+                if x_map_tag_dict[round(l['bbox'][0])] == min_x_tag:
                    # if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
-                    if not first_char.isalnum():
+                    if not first_char.isalnum() or if_match_reference_list(span_text):
                        line_fea_encode.append(1)
                    else:
                        line_fea_encode.append(4)
@@ -144,6 +147,36 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):

        return split_indices(total_lines, list_indice), list_start_idx

+def cluster_line_x(lines: list) -> tuple:
+    """Cluster the rounded bbox x0 (left edge) of every line in a block with DBSCAN.
+
+    :param lines: non-empty list of line dicts; ``line['bbox'][0]`` is read as x0.
+    :return: ``(x0_2_new_val, min_x0)`` - a dict mapping every rounded x0 to the smallest
+        x0 of its cluster, and the smallest cluster value seen (seeded with the rounded
+        x0 of the first line).
+    """
+    min_distance = 5  # passed to DBSCAN as eps
+    min_sample = 1  # passed to DBSCAN as min_samples
+    x0_lst = np.array([[round(line['bbox'][0]), 0] for line in lines])
+    labels = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x0_lst).labels_
+    x0_2_new_val = {}  # rounded raw x0 -> representative (minimum) x0 of its cluster
+    min_x0 = round(lines[0]["bbox"][0])
+    for label in np.unique(labels):
+        if label == -1:  # -1 is DBSCAN's noise label; such points get no mapping
+            continue
+        cluster_x0 = x0_lst[labels == label][:, 0]
+        cluster_min = int(cluster_x0.min())  # plain int: never mix numpy and Python ints in the result
+        x0_2_new_val.update({int(raw_val): cluster_min for raw_val in cluster_x0})
+        min_x0 = min(min_x0, cluster_min)
+    return x0_2_new_val, min_x0
+
+def if_match_reference_list(text: str) -> bool:
+    """Return True if ``text`` starts with digits followed by a dot, e.g. "12. Title"."""
+    # ``re.match`` anchors at the start of the string, so only the numeric prefix
+    # and the dot need to be spelled out; whatever follows is irrelevant.
+    # ``re`` caches compiled patterns, so calling this once per line stays cheap.
+    return re.match(r'^\d+\.', text) is not None
+

 def __valign_lines(blocks, layout_bboxes):
    """
@@ -315,10 +348,11 @@ def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en"):
        """
        for list_start in list_start_line:
            if len(list_start) > 1:
-                for i in range(1, len(list_start)):
+                for i in range(0, len(list_start)):
                    index = list_start[i] - 1
-                    if "content" in lines[index]["spans"][-1]:
-                        lines[index]["spans"][-1]["content"] += '\n\n'
+                    if index >= 0:
+                        if "content" in lines[index]["spans"][-1]:
+                            lines[index]["spans"][-1]["content"] += '\n\n'
        layout_list_info = [False, False]  # 这个layout最后是不是列表,记录每一个layout里是不是列表开头，列表结尾
        for content_type, start, end in text_segments:
            if content_type == 'list':
@@ -388,20 +422,17 @@ def __connect_list_inter_layout(blocks_group, new_layout_bbox, layout_list_info,
            logger.info(f"连接page {page_num} 内的list")
            # 向layout_paras[i] 寻找开头具有相同缩进的连续的行
            may_list_lines = []
-            for j in range(len(next_paras)):
-                lines = next_paras[j].get("lines", [])
-                if len(lines) == 1:  # 只可能是一行，多行情况再需要分析了
-                    if lines[0]['bbox'][0] > __find_layout_bbox_by_line(lines[0]['bbox'], new_layout_bbox)[0]:
-                        may_list_lines.append(lines[0])
-                    else:
-                        break
+            lines = next_first_para.get("lines", [])
+
+            for line in lines:
+                if line['bbox'][0] > __find_layout_bbox_by_line(line['bbox'], new_layout_bbox)[0]:
+                    may_list_lines.append(line)
                else:
                    break
            # 如果这些行的缩进是相等的，那么连到上一个layout的最后一个段落上。
            if len(may_list_lines) > 0 and len(set([x['bbox'][0] for x in may_list_lines])) == 1:
                pre_last_para.extend(may_list_lines)
-                blocks_group[i] = blocks_group[i][len(may_list_lines):]
-                # layout_paras[i] = layout_paras[i][len(may_list_lines):]
+                next_first_para["lines"] = next_first_para["lines"][len(may_list_lines):]

    return blocks_group, [layout_list_info[0][0], layout_list_info[-1][1]]  # 同时还返回了这个页面级别的开头、结尾是不是列表的信息

@@ -422,18 +453,14 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
        logger.info(f"连接page {page_num} 内的list")
        # 向layout_paras[i] 寻找开头具有相同缩进的连续的行
        may_list_lines = []
-        for j in range(len(next_page_paras[0])):
-            next_page_block_j = next_page_paras[0][j]
-            if next_page_block_j["type"] != BlockType.Text:
-                break
-            lines = next_page_block_j["lines"]
-            if len(lines) == 1:  # 只可能是一行，多行情况再需要分析了
-                if lines[0]['bbox'][0] > __find_layout_bbox_by_line(lines[0]['bbox'], next_page_layout_bbox)[0]:
-                    may_list_lines.append(lines[0])
+        next_page_first_para = next_page_paras[0][0]
+        if next_page_first_para["type"] == BlockType.Text:
+            lines = next_page_first_para["lines"]
+            for line in lines:
+                if line['bbox'][0] > __find_layout_bbox_by_line(line['bbox'], next_page_layout_bbox)[0]:
+                    may_list_lines.append(line)
                else:
                    break
-            else:
-                break
        # 如果这些行的缩进是相等的，那么连到上一个layout的最后一个段落上。
        if len(may_list_lines) > 0 and len(set([x['bbox'][0] for x in may_list_lines])) == 1:
            #pre_page_paras[-1].append(may_list_lines)
@@ -442,7 +469,7 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
                for span in line["spans"]:
                    span[CROSS_PAGE] = True
            pre_page_paras[-1][-1]["lines"].extend(may_list_lines)
-            next_page_paras[0] = next_page_paras[0][len(may_list_lines):]
+            next_page_first_para["lines"] = next_page_first_para["lines"][len(may_list_lines):]
            return True

    return False
@@ -471,7 +498,6 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
    if len(blocks_group) == 0:
        return connected_layout_blocks

-    #connected_layout_paras.append(layout_paras[0])
    connected_layout_blocks.append(blocks_group[0])
    for i in range(1, len(blocks_group)):
        try:
@@ -484,6 +510,9 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
            if blocks_group[i - 1][-1]["type"] != BlockType.Text or blocks_group[i][0]["type"] != BlockType.Text:
                connected_layout_blocks.append(blocks_group[i])
                continue
+            if len(blocks_group[i - 1][-1]["lines"]) == 0 or len(blocks_group[i][0]["lines"]) == 0:
+                connected_layout_blocks.append(blocks_group[i])
+                continue
            pre_last_line = blocks_group[i - 1][-1]["lines"][-1]
            next_first_line = blocks_group[i][0]["lines"][0]
        except Exception as e:
@@ -505,7 +534,7 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):

        pre_last_line_text = pre_last_line_text.strip()
        next_first_line_text = next_first_line_text.strip()
-        if pre_last_line['bbox'][2] == pre_x2_max and pre_last_line_text[-1] not in LINE_STOP_FLAG and \
+        if pre_last_line['bbox'][2] == pre_x2_max and pre_last_line_text and pre_last_line_text[-1] not in LINE_STOP_FLAG and \
                next_first_line['bbox'][0] == next_x0_min:  # 前面一行沾满了整个行，并且没有结尾符号.下一行没有空白开头。
            """连接段落条件成立，将前一个layout的段落和后一个layout的段落连接。"""
            #connected_layout_paras[-1][-1].extend(layout_paras[i][0])
--- a/magic_pdf/user_api.py
+++ b/magic_pdf/user_api.py
@@ -16,11 +16,11 @@ import re

 from loguru import logger

+from magic_pdf.libs.version import __version__
 from magic_pdf.rw import AbsReaderWriter
 from magic_pdf.pdf_parse_by_ocr_v2 import parse_pdf_by_ocr
 from magic_pdf.pdf_parse_by_txt_v2 import parse_pdf_by_txt

-
 PARSE_TYPE_TXT = "txt"
 PARSE_TYPE_OCR = "ocr"

@@ -39,6 +39,8 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit

    pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT

+    pdf_info_dict["_version_name"] = __version__
+
    return pdf_info_dict


@@ -57,6 +59,8 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit

    pdf_info_dict["_parse_type"] = PARSE_TYPE_OCR

+    pdf_info_dict["_version_name"] = __version__
+
    return pdf_info_dict


@@ -88,7 +92,7 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
                    for span in line['spans']:
                        text_all += span['content']

-    def calculate_garbled_rate(text):
+    def calculate_not_common_character_rate(text):
        garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]')
        # 计算乱码字符的数量
        garbage_count = len(garbage_regex.findall(text))
@@ -97,9 +101,18 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
            return 0  # 避免除以零的错误
        return garbage_count / total

-    garbled_rate = calculate_garbled_rate(text_all)
+    def calculate_not_printable_rate(text):
+        printable = sum(1 for c in text if c.isprintable())
+        total = len(text)
+        if total == 0:
+            return 0  # 避免除以零的错误
+        return (total - printable) / total

-    if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False) or garbled_rate > 0.8:
+    # not_common_character_rate = calculate_not_common_character_rate(text_all)
+    not_printable_rate = calculate_not_printable_rate(text_all)
+    # 测试乱码pdf，not_common_character_rate > 0.95, not_printable_rate > 0.15
+    # not_common_character_rate对小语种可能会有误伤，not_printable_rate对小语种较为友好
+    if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False) or not_printable_rate > 0.1:
        logger.warning(f"parse_pdf_by_txt drop or error or garbled_rate too large, switch to parse_pdf_by_ocr")
        pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
        if pdf_info_dict is None:
@@ -109,4 +122,6 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
    else:
        pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT

+    pdf_info_dict["_version_name"] = __version__
+
    return pdf_info_dict
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@ boto3>=1.28.43
 Brotli>=1.1.0
 click>=8.1.7
 Distance>=0.1.3
-PyMuPDF>=1.24.3
+PyMuPDF>=1.24.4
 loguru>=0.6.0
 matplotlib>=3.8.3
 numpy>=1.21.6
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,10 @@
-from setuptools import setup, find_packages
+import os
 import subprocess
+
+from setuptools import setup, find_packages
+from magic_pdf.libs.version import __version__
+
+
 def parse_requirements(filename):
    with open(filename) as f:
        lines = f.read().splitlines()
@@ -8,37 +13,22 @@ def parse_requirements(filename):

    for line in lines:
        if "http" in line:
-            pkg_name_with_version = line.split('/')[-1].split('-')[0]
-            requires.append(pkg_name_with_version)
+            pkg_name_without_url = line.split('@')[0].strip()
+            requires.append(pkg_name_without_url)
        else:
            requires.append(line)

    return requires

-def get_version():
-    command = ["git", "describe", "--tags"]
-    try:
-        version = subprocess.check_output(command).decode().strip()
-        version_parts = version.split("-")
-        if len(version_parts) > 1 and version_parts[0].startswith("magic_pdf"):
-            return version_parts[1]
-        else:
-            raise ValueError(f"Invalid version tag {version}. Expected format is magic_pdf-<version>-released.")
-    except Exception as e:
-        print(e)
-        return "0.0.0"

-
-requires = parse_requirements('requirements.txt')
-
-setup(
-    name="magic_pdf",  # 项目名
-    # version="0.1.3",  # 版本号
-    version=get_version(),  # 自动从tag中获取版本号
-    packages=find_packages(),  # 包含所有的包
-    install_requires=requires,  # 项目依赖的第三方库
-    python_requires=">=3.9",  # 项目依赖的 Python 版本
-    # entry_points={"console_scripts": ["my_command=my_project.main:run"]}, # 项目提供的可执行命令
-    include_package_data=True,  # 是否包含非代码文件，如数据文件、配置文件等
-    zip_safe=False,  # 是否使用 zip 文件格式打包，一般设为 False
-)
+if __name__ == '__main__':
+    setup(
+        name="magic_pdf",  # project name
+        version=__version__,  # imported from magic_pdf/libs/version.py
+        packages=find_packages(),  # include every package that is found
+        install_requires=parse_requirements('requirements.txt'),  # third-party dependencies of the project
+        python_requires=">=3.9",  # Python versions the project supports
+        # entry_points={"console_scripts": ["my_command=my_project.main:run"]}, # executable commands provided by the project
+        include_package_data=True,  # whether to ship non-code files such as data and config files
+        zip_safe=False,  # whether to package as a zip file; usually set to False
+    )
--- a/update_version.py
+++ b/update_version.py
@@ -0,0 +1,39 @@
+"""Write the version derived from the latest git tag into magic_pdf/libs/version.py."""
+import os
+import subprocess
+
+
+def get_version():
+    """Return the version encoded in the most recent git tag.
+
+    Runs ``git describe --tags`` and splits its output on "-"; when the first
+    part starts with ``magic_pdf`` the second part is returned as the version.
+    Any failure (command error, unexpected tag format) is printed and the
+    placeholder "0.0.0" is returned instead.
+    """
+    command = ["git", "describe", "--tags"]
+    try:
+        version = subprocess.check_output(command).decode().strip()
+        version_parts = version.split("-")
+        if len(version_parts) > 1 and version_parts[0].startswith("magic_pdf"):
+            return version_parts[1]
+        else:
+            raise ValueError(f"Invalid version tag {version}. Expected format is magic_pdf-<version>-released.")
+    except Exception as e:
+        # Best-effort by design: fall back to a placeholder rather than fail the caller.
+        print(e)
+        return "0.0.0"
+
+
+def write_version_to_commons(version):
+    """Overwrite magic_pdf/libs/version.py, located relative to this script, with ``version``."""
+    commons_path = os.path.join(os.path.dirname(__file__), 'magic_pdf', 'libs', 'version.py')
+    with open(commons_path, 'w') as f:
+        f.write(f'__version__ = "{version}"\n')
+
+
+if __name__ == '__main__':
+    # Entry point: running the script must actually regenerate version.py;
+    # defining the two helpers alone leaves the file untouched.
+    version_name = get_version()
+    write_version_to_commons(version_name)
Author	SHA1	Message	Date
赵小蒙	07f6c49707	change update version logic	2024-06-04 11:33:57 +08:00
赵小蒙	1de37e4c65	add version_name to middle json	2024-06-04 11:15:52 +08:00
赵小蒙	bd1834284e	add version_name to middle json	2024-06-03 18:51:38 +08:00
赵小蒙	496045f361	update annotation	2024-05-31 10:27:23 +08:00
赵小蒙	75478eda89	update setup	2024-05-30 10:26:10 +08:00
赵小蒙	3f3edc39f5	update setup	2024-05-30 10:25:02 +08:00
赵小蒙	97a4e47319	change garbled rate check from not_common_character_rate to not_printable_rate	2024-05-28 18:22:01 +08:00
myhloli	5de372245c	Merge pull request #116 from papayalove/master 修复分段边界问题	2024-05-28 10:26:19 +08:00
Kaiwen Liu	135adac43d	Merge branch 'magicpdf:master' into master	2024-05-28 10:21:11 +08:00
liukaiwen	ba52e33527	修复分段边界问题	2024-05-28 10:20:47 +08:00
myhloli	78ed786794	Merge pull request #115 from papayalove/master 修复边界问题（修复list拼接和reference分行问题）	2024-05-27 15:19:15 +08:00
liukaiwen	4ff09a2fbc	修复边界问题（修复list拼接和reference分行问题）	2024-05-27 15:16:00 +08:00
赵小蒙	f8548a8ea2	update PyMuPDF to 1.24.4	2024-05-27 14:40:17 +08:00
myhloli	10a95bcd05	Merge pull request #114 from papayalove/master 修复list拼接和reference分行问题	2024-05-24 16:34:07 +08:00
liukaiwen	dbdbaf58be	Merge branch 'master' of github.com:papayalove/Magic-PDF	2024-05-24 16:31:11 +08:00
liukaiwen	afe92f07d6	修复list拼接和reference分行问题	2024-05-24 16:31:00 +08:00