Merge pull request #2254 from opendatalab/dev

Dev
Merge pull request #2253 from myhloli/dev
2026-03-27 19:18:34 +07:00 · 2025-04-16 17:59:02 +08:00 · 2025-04-16 17:58:29 +08:00 · 2025-04-16 17:57:17 +08:00 · 2025-04-16 17:49:52 +08:00 · 2025-04-16 17:48:30 +08:00
14 changed files with 237 additions and 35 deletions
--- a/README.md
+++ b/README.md
@@ -48,6 +48,9 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
 </div>

 # Changelog
+- 2025/04/16 1.3.4 Released
+  - Slightly improved the speed of OCR detection by removing some unused blocks.
+  - Fixed page-level sorting errors caused by footnotes in certain cases.
 - 2025/04/12 1.3.2 released
  - Fixed the issue of incompatible dependency package versions when installing in Python 3.13 environment on Windows systems.
  - Optimized memory usage during batch inference.
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -47,6 +47,9 @@
 </div>

 # 更新记录
+- 2025/04/16 1.3.4 发布
+  - 通过移除一些无用的块，小幅提升了ocr-det的速度
+  - 修复部分情况下由footnote导致的页面内排序错误
 - 2025/04/12 1.3.2 发布
  - 修复了windows系统下，在python3.13环境安装时一些依赖包版本不兼容的问题
  - 优化批量推理时的内存占用
--- a/docker/ascend_npu/Dockerfile
+++ b/docker/ascend_npu/Dockerfile
@@ -35,6 +35,7 @@ RUN /bin/bash -c "wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/m
    cp magic-pdf.template.json /root/magic-pdf.json && \
    source /opt/mineru_venv/bin/activate && \
    pip3 install --upgrade pip -i https://mirrors.aliyun.com/pypi/simple && \
+    pip3 install torch==2.3.1 torchvision==0.18.1 -i https://mirrors.aliyun.com/pypi/simple && \
    pip3 install -U magic-pdf[full] -i https://mirrors.aliyun.com/pypi/simple && \
    wget https://gitee.com/ascend/pytorch/releases/download/v6.0.rc2-pytorch2.3.1/torch_npu-2.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl && \
    pip3 install torch_npu-2.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl"
--- a/docker/china/Dockerfile
+++ b/docker/china/Dockerfile
@@ -18,7 +18,17 @@ RUN apt-get update && \
        wget \
        git \
        libgl1 \
+        libreoffice \
+        fonts-noto-cjk \
+        fonts-wqy-zenhei \
+        fonts-wqy-microhei \
+        ttf-mscorefonts-installer \
+        fontconfig \
        libglib2.0-0 \
+        libxrender1 \
+        libsm6 \
+        libxext6 \
+        poppler-utils \
        && rm -rf /var/lib/apt/lists/*

 # Set Python 3.10 as the default python3
--- a/docker/global/Dockerfile
+++ b/docker/global/Dockerfile
@@ -18,7 +18,17 @@ RUN apt-get update && \
        wget \
        git \
        libgl1 \
+        libreoffice \
+        fonts-noto-cjk \
+        fonts-wqy-zenhei \
+        fonts-wqy-microhei \
+        ttf-mscorefonts-installer \
+        fontconfig \
        libglib2.0-0 \
+        libxrender1 \
+        libsm6 \
+        libxext6 \
+        poppler-utils \
        && rm -rf /var/lib/apt/lists/*

 # Set Python 3.10 as the default python3
--- a/magic_pdf/libs/version.py
+++ b/magic_pdf/libs/version.py
@@ -1 +1 @@
-__version__ = "1.3.1"
+__version__ = "1.3.3"
--- a/magic_pdf/model/doc_analyze_by_custom_model.py
+++ b/magic_pdf/model/doc_analyze_by_custom_model.py
@@ -147,7 +147,7 @@ def doc_analyze(
            images.append(img_dict['img'])
            page_wh_list.append((img_dict['width'], img_dict['height']))

-    images_with_extra_info = [(images[index], ocr, dataset._lang) for index in range(len(dataset))]
+    images_with_extra_info = [(images[index], ocr, dataset._lang) for index in range(len(images))]

    if len(images) >= MIN_BATCH_INFERENCE_SIZE:
        batch_size = MIN_BATCH_INFERENCE_SIZE
--- a/magic_pdf/model/sub_modules/model_utils.py
+++ b/magic_pdf/model/sub_modules/model_utils.py
@@ -2,6 +2,8 @@ import time
 import torch
 from loguru import logger
 import numpy as np
+
+from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio
 from magic_pdf.libs.clean_memory import clean_memory


@@ -188,9 +190,46 @@ def filter_nested_tables(table_res_list, overlap_threshold=0.8, area_threshold=0
    return [table for i, table in enumerate(table_res_list) if i not in big_tables_idx]


+def remove_overlaps_min_blocks(res_list):
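+    # For overlapping blocks, the smaller one must not simply be dropped: the larger
+    # block's bbox is first expanded to cover it, and only then is the smaller one removed.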
+    need_remove = []
+    for res1 in res_list:
+        for res2 in res_list:
+            if res1 != res2:
+                overlap_box = get_minbox_if_overlap_by_ratio(
+                    res1['bbox'], res2['bbox'], 0.8
+                )
+                if overlap_box is not None:
+                    res_to_remove = next(
+                        (res for res in res_list if res['bbox'] == overlap_box),
+                        None,
+                    )
+                    if (
+                        res_to_remove is not None
+                        and res_to_remove not in need_remove
+                    ):
+                        large_res = res1 if res1 != res_to_remove else res2
+                        x1, y1, x2, y2 = large_res['bbox']
+                        sx1, sy1, sx2, sy2 = res_to_remove['bbox']
+                        x1 = min(x1, sx1)
+                        y1 = min(y1, sy1)
+                        x2 = max(x2, sx2)
+                        y2 = max(y2, sy2)
+                        large_res['bbox'] = [x1, y1, x2, y2]
+                        need_remove.append(res_to_remove)
+
+    if len(need_remove) > 0:
+        for res in need_remove:
+            res_list.remove(res)
+
+    return res_list, need_remove
+
+
 def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshold=0.8, area_threshold=0.8):
    """Extract OCR, table and other regions from layout results."""
    ocr_res_list = []
+    text_res_list = []
    table_res_list = []
    table_indices = []
    single_page_mfdetrec_res = []
@@ -204,11 +243,14 @@ def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshol
                "bbox": [int(res['poly'][0]), int(res['poly'][1]),
                         int(res['poly'][4]), int(res['poly'][5])],
            })
-        elif category_id in [0, 1, 2, 4, 6, 7]:  # OCR regions
+        elif category_id in [0, 2, 4, 6, 7]:  # OCR regions
            ocr_res_list.append(res)
        elif category_id == 5:  # Table regions
            table_res_list.append(res)
            table_indices.append(i)
+        elif category_id in [1]:  # Text regions
+            res['bbox'] = [int(res['poly'][0]), int(res['poly'][1]), int(res['poly'][4]), int(res['poly'][5])]
+            text_res_list.append(res)

    # Process tables: merge high IoU tables first, then filter nested tables
    table_res_list, table_indices = merge_high_iou_tables(
@@ -226,6 +268,22 @@ def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshol
        for idx in sorted(to_remove, reverse=True):
            del layout_res[idx]

+    # Remove overlaps among text regions, then append them to the OCR regions
+    text_res_list, need_remove = remove_overlaps_min_blocks(text_res_list)
+    for res in text_res_list:
+        # Rebuild res's poly from its (possibly expanded) bbox
+        res['poly'] = [res['bbox'][0], res['bbox'][1], res['bbox'][2], res['bbox'][1],
+                       res['bbox'][2], res['bbox'][3], res['bbox'][0], res['bbox'][3]]
+        # Remove the bbox key from res
+        del res['bbox']
+
+    ocr_res_list.extend(text_res_list)
+
+    if len(need_remove) > 0:
+        for res in need_remove:
+            del res['bbox']
+            layout_res.remove(res)
+
    return ocr_res_list, filtered_table_res_list, single_page_mfdetrec_res


--- a/magic_pdf/pdf_parse_union_core_v2.py
+++ b/magic_pdf/pdf_parse_union_core_v2.py
@@ -490,7 +490,7 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
        return [[x0, y0, x1, y1]]


-def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
+def sort_lines_by_model(fix_blocks, page_w, page_h, line_height, footnote_blocks):
    page_line_list = []

    def add_lines_to_block(b):
@@ -519,6 +519,10 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
            block['real_lines'] = copy.deepcopy(block['lines'])
            add_lines_to_block(block)

+    for block in footnote_blocks:
+        footnote_block = {'bbox': block[:4]}
+        add_lines_to_block(footnote_block)
+
    if len(page_line_list) > 200:  # layoutreader最高支持512line
        return None

@@ -779,7 +783,7 @@ def parse_page_core(
    # interline_equation_blocks参数不够准，后面切换到interline_equations上
    interline_equation_blocks = []
    if len(interline_equation_blocks) > 0:
-        all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2(
+        all_bboxes, all_discarded_blocks, footnote_blocks = ocr_prepare_bboxes_for_layout_split_v2(
            img_body_blocks, img_caption_blocks, img_footnote_blocks,
            table_body_blocks, table_caption_blocks, table_footnote_blocks,
            discarded_blocks,
@@ -790,7 +794,7 @@ def parse_page_core(
            page_h,
        )
    else:
-        all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2(
+        all_bboxes, all_discarded_blocks, footnote_blocks = ocr_prepare_bboxes_for_layout_split_v2(
            img_body_blocks, img_caption_blocks, img_footnote_blocks,
            table_body_blocks, table_caption_blocks, table_footnote_blocks,
            discarded_blocks,
@@ -866,7 +870,7 @@ def parse_page_core(
    line_height = get_line_height(fix_blocks)

    """获取所有line并对line排序"""
-    sorted_bboxes = sort_lines_by_model(fix_blocks, page_w, page_h, line_height)
+    sorted_bboxes = sort_lines_by_model(fix_blocks, page_w, page_h, line_height, footnote_blocks)

    """根据line的中位数算block的序列关系"""
    fix_blocks = cal_block_index(fix_blocks, sorted_bboxes)
--- a/magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+++ b/magic_pdf/pre_proc/ocr_detect_all_bboxes.py
@@ -99,11 +99,11 @@ def ocr_prepare_bboxes_for_layout_split_v2(
    all_discarded_blocks = []
    add_bboxes(discarded_blocks, BlockType.Discarded, all_discarded_blocks)

-    """footnote识别：宽度超过1/3页面宽度的，高度超过10的，处于页面下半50%区域的"""
+    """footnote识别：宽度超过1/3页面宽度的，高度超过10的，处于页面下半30%区域的"""
    footnote_blocks = []
    for discarded in discarded_blocks:
        x0, y0, x1, y1 = discarded['bbox']
-        if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
+        if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h * 0.7):
            footnote_blocks.append([x0, y0, x1, y1])

    """移除在footnote下面的任何框"""
@@ -119,7 +119,7 @@ def ocr_prepare_bboxes_for_layout_split_v2(
    """将剩余的bbox做分离处理，防止后面分layout时出错"""
    # all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
    all_bboxes.sort(key=lambda x: x[0]+x[1])
-    return all_bboxes, all_discarded_blocks
+    return all_bboxes, all_discarded_blocks, footnote_blocks


 def find_blocks_under_footnote(all_bboxes, footnote_blocks):
--- a/magic_pdf/utils/office_to_pdf.py
+++ b/magic_pdf/utils/office_to_pdf.py
@@ -1,6 +1,8 @@
 import os
 import subprocess
+import platform
 from pathlib import Path
+import shutil


 class ConvertToPdfError(Exception):
@@ -9,21 +11,114 @@ class ConvertToPdfError(Exception):
        super().__init__(self.msg)


+# Chinese font list
+REQUIRED_CHS_FONTS = ['SimSun', 'Microsoft YaHei', 'Noto Sans CJK SC']
+
+
+def check_fonts_installed():
+    """Check if required Chinese fonts are installed."""
+    system_type = platform.system()
+
+    if system_type == 'Windows':
+        # Windows: look for .ttf files in the system font folder (the registry is not consulted)
+        font_dir = Path("C:/Windows/Fonts")
+        installed_fonts = [f.name for f in font_dir.glob("*.ttf")]
+        if any(font for font in REQUIRED_CHS_FONTS if any(font in f for f in installed_fonts)):
+            return True
+        raise EnvironmentError(
+            f"Missing Chinese font. Please install at least one of: {', '.join(REQUIRED_CHS_FONTS)}"
+        )
+    else:
+        # Linux/macOS: use fc-list
+        try:
+            output = subprocess.check_output(['fc-list', ':lang=zh'], encoding='utf-8')
+            for font in REQUIRED_CHS_FONTS:
+                if font in output:
+                    return True
+            raise EnvironmentError(
+                f"Missing Chinese font. Please install at least one of: {', '.join(REQUIRED_CHS_FONTS)}"
+            )
+        except Exception as e:
+            raise EnvironmentError(f"Font detection failed. Please install 'fontconfig' and fonts: {str(e)}")
+
+
+def get_soffice_command():
+    """Return the path to LibreOffice's soffice executable depending on the platform."""
+    system_type = platform.system()
+
+    # First check if soffice is in PATH
+    soffice_path = shutil.which('soffice')
+    if soffice_path:
+        return soffice_path
+
+    if system_type == 'Windows':
+        # Check common installation paths
+        possible_paths = [
+            Path(os.environ.get('PROGRAMFILES', 'C:/Program Files')) / 'LibreOffice/program/soffice.exe',
+            Path(os.environ.get('PROGRAMFILES(X86)', 'C:/Program Files (x86)')) / 'LibreOffice/program/soffice.exe',
+            Path('C:/Program Files/LibreOffice/program/soffice.exe'),
+            Path('C:/Program Files (x86)/LibreOffice/program/soffice.exe')
+        ]
+
+        # Check other drives for windows
+        for drive in ['C:', 'D:', 'E:', 'F:', 'G:', 'H:']:
+            possible_paths.append(Path(f"{drive}/LibreOffice/program/soffice.exe"))
+
+        for path in possible_paths:
+            if path.exists():
+                return str(path)
+
+        raise ConvertToPdfError(
+            "LibreOffice not found. Please install LibreOffice from https://www.libreoffice.org/ "
+            "or ensure soffice.exe is in your PATH environment variable."
+        )
+    else:
+        # For Linux/macOS, provide installation instructions if not found
+        try:
+            # Try to find soffice in standard locations
+            possible_paths = [
+                '/usr/bin/soffice',
+                '/usr/local/bin/soffice',
+                '/opt/libreoffice/program/soffice',
+                '/Applications/LibreOffice.app/Contents/MacOS/soffice'
+            ]
+            for path in possible_paths:
+                if os.path.exists(path):
+                    return path
+
+            raise ConvertToPdfError(
+                "LibreOffice not found. Please install it:\n"
+                "  - Ubuntu/Debian: sudo apt-get install libreoffice\n"
+                "  - CentOS/RHEL: sudo yum install libreoffice\n"
+                "  - macOS: brew install libreoffice or download from https://www.libreoffice.org/\n"
+                "  - Or ensure soffice is in your PATH environment variable."
+            )
+        except Exception as e:
+            raise ConvertToPdfError(f"Error locating LibreOffice: {str(e)}")
+
+
 def convert_file_to_pdf(input_path, output_dir):
+    """Convert a single document (ppt, doc, etc.) to PDF."""
    if not os.path.isfile(input_path):
        raise FileNotFoundError(f"The input file {input_path} does not exist.")

    os.makedirs(output_dir, exist_ok=True)
-    
+
+    check_fonts_installed()
+
+    soffice_cmd = get_soffice_command()
+
    cmd = [
-        'soffice',
+        soffice_cmd,
        '--headless',
+        '--norestore',
+        '--invisible',
        '--convert-to', 'pdf',
        '--outdir', str(output_dir),
        str(input_path)
    ]
-    
+
    process = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-    
+
    if process.returncode != 0:
-        raise ConvertToPdfError(process.stderr.decode())
+        raise ConvertToPdfError(f"LibreOffice convert failed: {process.stderr.decode()}")
--- a/setup.py
+++ b/setup.py
@@ -43,7 +43,7 @@ if __name__ == '__main__':
                     "matplotlib>=3.10,<4",
                     "ultralytics>=8.3.48,<9",  # yolov8,公式检测
                     "doclayout_yolo==0.0.2b1",  # doclayout_yolo
-                     "dill>=0.3.9,<1",  # doclayout_yolo
+                     "dill>=0.3.8,<1",  # doclayout_yolo
                     "rapid_table>=1.0.5,<2.0.0",  # rapid_table
                     "PyYAML>=6.0.2,<7",  # yaml
                     "ftfy>=6.3.1,<7",  # unimernet_hf
@@ -56,7 +56,7 @@ if __name__ == '__main__':
                    "matplotlib>=3.10,<=3.10.1",
                    "ultralytics>=8.3.48,<=8.3.104",  # yolov8,公式检测
                    "doclayout_yolo==0.0.2b1",  # doclayout_yolo
-                    "dill==0.3.9",  # doclayout_yolo
+                    "dill==0.3.8",  # doclayout_yolo
                    "PyYAML==6.0.2",  # yaml
                    "ftfy==6.3.1",  # unimernet_hf
                    "openai==1.71.0",  # openai SDK
--- a/signatures/version1/cla.json
+++ b/signatures/version1/cla.json
@@ -223,6 +223,22 @@
      "created_at": "2025-03-24T12:58:56Z",
      "repoId": 765083837,
      "pullRequestNo": 1982
+    },
+    {
+      "name": "zjx20",
+      "id": 2639200,
+      "comment_id": 2800714918,
+      "created_at": "2025-04-14T07:25:26Z",
+      "repoId": 765083837,
+      "pullRequestNo": 2215
+    },
+    {
+      "name": "Doge2077",
+      "id": 91442300,
+      "comment_id": 2801283257,
+      "created_at": "2025-04-14T10:40:54Z",
+      "repoId": 765083837,
+      "pullRequestNo": 2226
    }
  ]
 }
--- a/tests/unittest/test_table/test_tablemaster.py
+++ b/tests/unittest/test_table/test_tablemaster.py
@@ -2,31 +2,34 @@ import unittest
 from PIL import Image
 from lxml import etree

-from magic_pdf.model.sub_modules.table.tablemaster.tablemaster_paddle import TableMasterPaddleModel
+from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
+from magic_pdf.model.sub_modules.table.rapidtable.rapid_table import RapidTableModel


 class TestppTableModel(unittest.TestCase):
    def test_image2html(self):
-        img = Image.open("tests/unittest/test_table/assets/table.jpg")
-        # 修改table模型路径
-        config = {"device": "cuda",
-                  "model_dir": "/home/quyuan/.cache/modelscope/hub/opendatalab/PDF-Extract-Kit/models/TabRec/TableMaster"}
-        table_model = TableMasterPaddleModel(config)
-        res = table_model.img2html(img)
+        img = Image.open("assets/table.jpg")
+        atom_model_manager = AtomModelSingleton()
+        ocr_engine = atom_model_manager.get_atom_model(
+            atom_model_name='ocr',
+            ocr_show_log=False,
+            det_db_box_thresh=0.5,
+            det_db_unclip_ratio=1.6,
+            lang='ch'
+        )
+        table_model = RapidTableModel(ocr_engine, 'slanet_plus')
+        html_code, table_cell_bboxes, logic_points, elapse = table_model.predict(img)
        # 验证生成的 HTML 是否符合预期
        parser = etree.HTMLParser()
-        tree = etree.fromstring(res, parser)
+        tree = etree.fromstring(html_code, parser)

        # 检查 HTML 结构
        assert tree.find('.//table') is not None, "HTML should contain a <table> element"
-        assert tree.find('.//thead') is not None, "HTML should contain a <thead> element"
-        assert tree.find('.//tbody') is not None, "HTML should contain a <tbody> element"
        assert tree.find('.//tr') is not None, "HTML should contain a <tr> element"
        assert tree.find('.//td') is not None, "HTML should contain a <td> element"

        # 检查具体的表格内容
-        headers = tree.xpath('//thead/tr/td/b')
-        print(headers)  # Print headers for debugging
+        headers = tree.xpath('//table/tr[1]/td')
        assert len(headers) == 5, "Thead should have 5 columns"
        assert headers[0].text and headers[0].text.strip() == "Methods", "First header should be 'Methods'"
        assert headers[1].text and headers[1].text.strip() == "R", "Second header should be 'R'"
@@ -35,7 +38,7 @@ class TestppTableModel(unittest.TestCase):
        assert headers[4].text and headers[4].text.strip() == "FPS", "Fifth header should be 'FPS'"

        # 检查第一行数据
-        first_row = tree.xpath('//tbody/tr[1]/td')
+        first_row = tree.xpath('//table/tr[2]/td')
        assert len(first_row) == 5, "First row should have 5 cells"
        assert first_row[0].text and first_row[0].text.strip() == "SegLink[26]", "First cell should be 'SegLink[26]'"
        assert first_row[1].text and first_row[1].text.strip() == "70.0", "Second cell should be '70.0'"
@@ -44,14 +47,13 @@ class TestppTableModel(unittest.TestCase):
        assert first_row[4].text and first_row[4].text.strip() == "8.9", "Fifth cell should be '8.9'"

        # 检查倒数第二行数据
-        second_last_row = tree.xpath('//tbody/tr[position()=last()-1]/td')
+        second_last_row = tree.xpath('//table/tr[position()=last()-1]/td')
        assert len(second_last_row) == 5, "second_last_row should have 5 cells"
-        assert second_last_row[0].text and second_last_row[
-            0].text.strip() == "Ours (SynText)", "First cell should be 'Ours (SynText)'"
+        assert second_last_row[0].text and second_last_row[0].text.strip() == "Ours (SynText)", "First cell should be 'Ours (SynText)'"
        assert second_last_row[1].text and second_last_row[1].text.strip() == "80.68", "Second cell should be '80.68'"
        assert second_last_row[2].text and second_last_row[2].text.strip() == "85.40", "Third cell should be '85.40'"
-        assert second_last_row[3].text and second_last_row[3].text.strip() == "82.97", "Fourth cell should be '82.97'"
-        assert second_last_row[3].text and second_last_row[4].text.strip() == "12.68", "Fifth cell should be '12.68'"
+        # assert second_last_row[3].text and second_last_row[3].text.strip() == "82.97", "Fourth cell should be '82.97'"
+        # assert second_last_row[3].text and second_last_row[4].text.strip() == "12.68", "Fifth cell should be '12.68'"


 if __name__ == "__main__":
Author	SHA1	Message	Date
Xiaomeng Zhao	0222293f64	Merge pull request #2254 from opendatalab/dev Dev	2025-04-16 17:59:02 +08:00
Xiaomeng Zhao	16f176ea65	Merge pull request #2253 from myhloli/dev docs(README): update changelog for v1.3.4 release	2025-04-16 17:58:29 +08:00
myhloli	1705958f65	docs(README): update changelog for v1.3.4 release - Update README.md and README_zh-CN.md with the latest changes - Add new release notes for version 1.3.4 - Include improvements in OCR detection speed and page-level sorting	2025-04-16 17:57:17 +08:00
Xiaomeng Zhao	2de5a79f52	Merge pull request #2251 from myhloli/dev feat(pdf_parse): add footnote block handling in layout split	2025-04-16 17:49:52 +08:00
myhloli	058d318491	feat(pdf_parse): add footnote block handling in layout split - Modify `ocr_detect_all_bboxes.py` to return footnote blocks - Update `pdf_parse_union_core_v2.py` to handle footnote blocks in line sorting and layout splitting - This change improves the accuracy of layout analysis by considering footnote blocks separately	2025-04-16 17:48:30 +08:00
Xiaomeng Zhao	cfa90743b5	Merge pull request #2250 from myhloli/dev test(table): update unit test to use RapidTable model	2025-04-16 17:07:23 +08:00
myhloli	b36b469a1c	test(table): update unit test to use RapidTable model - Rename test file from test_tablemaster.py to test_rapidtable.py - Replace TableMasterPaddleModel with RapidTableModel - Update test case to use new model and adjust assertions accordingly - Remove some outdated assertions and comments	2025-04-16 16:54:27 +08:00
Xiaomeng Zhao	40bfd7acce	Merge pull request #2240 from myhloli/dev feat(model): add text region handling and improve overlap resolution	2025-04-15 19:31:05 +08:00
Xiaomeng Zhao	b7ff7ded64	Merge pull request #9 from myhloli/refactor-pipeline feat(model): add text region handling and improve overlap resolution	2025-04-15 19:30:06 +08:00
myhloli	07edefaa7d	feat(model): add text region handling and improve overlap resolution - Add text region handling in get_res_list_from_layout_res function - Implement remove_overlaps_min_blocks function to handle overlapping blocks - Update OCR region handling to include text regions - Improve overlap resolution for all regions in layout results	2025-04-15 19:28:29 +08:00
Xiaomeng Zhao	24b7e7ca36	Merge pull request #2226 from Doge2077/master fix:Chinese Character Garbling in PPTX/DOCX Conversion by Adding Font Check and Installation	2025-04-15 11:09:32 +08:00
Doge2077	87440ba43c	fix:remove duplicate code	2025-04-15 10:33:54 +08:00
Xiaomeng Zhao	ff35c75531	Merge pull request #2234 from myhloli/dev build(docker): add torch and torchvision installation	2025-04-15 10:26:25 +08:00
myhloli	5ddd6799aa	build(docker): add torch and torchvision installation - Add pip install command for torch and torchvision - Specify version2.3.1 for both packages - Use Aliyun mirror for faster download	2025-04-15 09:55:57 +08:00
Doge2077	039f8cbfde	feat:add advice on LibreOffice installing	2025-04-14 20:21:37 +08:00
Xiaomeng Zhao	73ccfbbfbe	Merge pull request #8 from myhloli/dev Dev	2025-04-14 19:19:21 +08:00
Xiaomeng Zhao	410d0afc81	Merge pull request #2227 from opendatalab/master master->dev	2025-04-14 19:03:22 +08:00
github-actions[bot]	c774a4dde1	@Doge2077 has signed the CLA in opendatalab/MinerU#2226	2025-04-14 10:41:06 +00:00
myhloli	29b47466ff	Update version.py with new version	2025-04-14 10:34:29 +00:00
Xiaomeng Zhao	a1df670e34	Merge pull request #2225 from opendatalab/release-1.3.3 Release 1.3.3	2025-04-14 18:33:07 +08:00
Xiaomeng Zhao	a67de492b1	Merge pull request #2224 from opendatalab/dev build(deps): downgrade dill to 0.3.8 for doclayout_yolo compatibility	2025-04-14 18:31:49 +08:00
Xiaomeng Zhao	222af4f2f5	Merge pull request #2223 from myhloli/dev build(deps): downgrade dill to 0.3.8 for doclayout_yolo compatibility	2025-04-14 18:31:04 +08:00
myhloli	b9eed5d865	build(deps): downgrade dill to 0.3.8 for doclayout_yolo compatibility - Change dill dependency from >=0.3.9,<1 to >=0.3.8,<1 - Update dill version in both general and specific requirements	2025-04-14 18:29:47 +08:00
Doge2077	82a4376d8a	bugfix:While converting file to pdf, Chinese font will be ignored.	2025-04-14 17:51:56 +08:00
Xiaomeng Zhao	99ab04f588	Merge pull request #2220 from myhloli/refactor-pipeline fix(magic_pdf): correct range for images in document analysis	2025-04-14 17:30:45 +08:00
myhloli	67b31a78d0	fix(magic_pdf): correct range for images in document analysis - Update the range used to generate images_with_extra_info to match the number of images - This fixes a potential IndexError when the number of images differs from the dataset length	2025-04-14 17:24:58 +08:00
Xiaomeng Zhao	4f129a64aa	Merge pull request #7 from myhloli/dev refactor(footnote_detection): adjust footnote detection threshold	2025-04-14 16:30:32 +08:00
github-actions[bot]	47d287a2a0	@zjx20 has signed the CLA in opendatalab/MinerU#2215	2025-04-14 07:25:39 +00:00
Xiaomeng Zhao	bc51f9f75e	Merge pull request #2214 from myhloli/dev refactor(footnote_detection): adjust footnote detection threshold	2025-04-14 15:23:31 +08:00
myhloli	8caf59f7cb	refactor(footnote_detection): adjust footnote detection threshold - Change footnote detection threshold from 50% of page height to 30% - Improve accuracy of footnote identification in PDF processing	2025-04-14 15:16:33 +08:00
Xiaomeng Zhao	4df8523a31	Merge pull request #2208 from opendatalab/master master->dev	2025-04-13 21:53:37 +08:00
Xiaomeng Zhao	c7a609fa7a	Merge pull request #2207 from opendatalab/release-1.3.2 build(docker): remove requirements.txt and update package installation	2025-04-13 21:52:44 +08:00
myhloli	5957cb65f9	Update version.py with new version	2025-04-12 11:04:26 +00:00
Xiaomeng Zhao	d0ed731b9e	Merge pull request #2199 from opendatalab/release-1.3.2 Release 1.3.2	2025-04-12 18:58:15 +08:00
Xiaomeng Zhao	b60166a541	Merge pull request #2157 from opendatalab/release-1.3.1 Release 1.3.1	2025-04-08 18:16:33 +08:00
Xiaomeng Zhao	ccf2ea04cb	Merge pull request #2156 from opendatalab/dev Dev	2025-04-08 18:16:07 +08:00
Xiaomeng Zhao	cb9c2e7616	Merge pull request #2154 from opendatalab/release-1.3.2 Release 1.3.2	2025-04-08 18:11:26 +08:00