Merge pull request #1165 from opendatalab/release-0.10.5

Release 0.10.5
Merge pull request #1167 from opendatalab/dev
2026-03-27 11:08:32 +07:00 · 2024-12-02 11:44:51 +08:00 · 2024-12-02 11:44:16 +08:00 · 2024-12-02 11:41:57 +08:00 · 2024-12-02 11:32:54 +08:00 · 2024-12-02 11:27:22 +08:00
6 changed files with 8 additions and 8 deletions
--- a/magic_pdf/data/utils.py
+++ b/magic_pdf/data/utils.py
@@ -20,8 +20,8 @@ def fitz_doc_to_image(doc, dpi=200) -> dict:
    mat = fitz.Matrix(dpi / 72, dpi / 72)
    pm = doc.get_pixmap(matrix=mat, alpha=False)

-    # If the width or height exceeds 9000 after scaling, do not scale further.
-    if pm.width > 9000 or pm.height > 9000:
+    # If the width or height exceeds 4500 after scaling, do not scale further.
+    if pm.width > 4500 or pm.height > 4500:
        pm = doc.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)

    img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
--- a/magic_pdf/libs/version.py
+++ b/magic_pdf/libs/version.py
@@ -1 +1 @@
-__version__ = "0.10.3"
+__version__ = "0.10.4"
--- a/magic_pdf/para/para_split_v3.py
+++ b/magic_pdf/para/para_split_v3.py
@@ -112,8 +112,8 @@ def __is_list_or_index_block(block):
            line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
            block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
            if (
-                line['bbox'][0] - block['bbox_fs'][0] > 0.8 * line_height
-                and block['bbox_fs'][2] - line['bbox'][2] > 0.8 * line_height
+                line['bbox'][0] - block['bbox_fs'][0] > 0.7 * line_height
+                and block['bbox_fs'][2] - line['bbox'][2] > 0.7 * line_height
            ):
                external_sides_not_close_num += 1
            if abs(line_mid_x - block_mid_x) < line_height / 2:
--- a/magic_pdf/pre_proc/cut_image.py
+++ b/magic_pdf/pre_proc/cut_image.py
@@ -12,12 +12,12 @@ def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
    for span in spans:
        span_type = span['type']
        if span_type == ContentType.Image:
-            if not check_img_bbox(span['bbox']):
+            if not check_img_bbox(span['bbox']) or not imageWriter:
                continue
            span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('images'),
                                           imageWriter=imageWriter)
        elif span_type == ContentType.Table:
-            if not check_img_bbox(span['bbox']):
+            if not check_img_bbox(span['bbox']) or not imageWriter:
                continue
            span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'),
                                           imageWriter=imageWriter)
--- a/projects/web_demo/web_demo/api/analysis/pdf_ext.py
+++ b/projects/web_demo/web_demo/api/analysis/pdf_ext.py
@@ -15,7 +15,7 @@ from magic_pdf.data.data_reader_writer import FileBasedDataWriter
 from magic_pdf.libs.json_compressor import JsonCompressor
 from magic_pdf.pipe.UNIPipe import UNIPipe

-from ..extensions import app, db
+from ..extentions import app, db
 from .ext import find_file
 from .models import AnalysisPdf, AnalysisTask

--- a/projects/web_demo/web_demo/api/extentions.py
+++ b/projects/web_demo/web_demo/api/extentions.py
Author	SHA1	Message	Date
Xiaomeng Zhao	c175001d6a	Merge pull request #1165 from opendatalab/release-0.10.5 Release 0.10.5	2024-12-02 11:44:51 +08:00
Xiaomeng Zhao	a35785b9e5	Merge pull request #1167 from opendatalab/dev Dev -> 0.10.5	2024-12-02 11:44:16 +08:00
Xiaomeng Zhao	a7296f78f0	Merge pull request #1166 from myhloli/dev fix(pre_proc): prevent errors when imageWriter is None	2024-12-02 11:41:57 +08:00
Xiaomeng Zhao	ed822634df	Merge pull request #1164 from myhloli/dev refactor(para): adjust line height multiplier for block splitting, fix(pre_proc): prevent errors when imageWriter is None	2024-12-02 11:32:54 +08:00
myhloli	b0529b6fbd	fix: reduce maximum image size - Decrease the maximum width and height from 9000 to 4500 pixels - This change aims to prevent excessive resource usage when rendering PDFs	2024-12-02 11:27:22 +08:00
myhloli	7f8dc353b0	fix(pre_proc): prevent errors when imageWriter is None - Updated cut_image.py to check for NoneType imageWriter - Prevents AttributeError when imageWriter is not provided	2024-12-02 11:20:49 +08:00
Xiaomeng Zhao	384e03799c	Merge pull request #1156 from myhloli/dev refactor(para): adjust line height multiplier for block splitting	2024-12-01 02:06:17 +08:00
myhloli	41545a13c6	refactor(para): adjust line height multiplier for block splitting - Decrease the line height multiplier from 0.8 to 0.7 for both left and right sides - This modification aims to improve the accuracy of paragraph splitting	2024-12-01 02:05:49 +08:00
Xiaomeng Zhao	b17084febb	Merge pull request #1154 from LollipopsAndWine/dev	2024-11-30 17:05:52 +08:00
houlinfeng	f11f3d6032	fix: 修复文件名错误	2024-11-30 16:52:03 +08:00
myhloli	f8828be7eb	Update version.py with new version	2024-11-29 18:48:40 +00:00
Xiaomeng Zhao	b03a7fae5e	Merge pull request #1153 from opendatalab/release-0.10.4 Release 0.10.4	2024-11-30 02:47:28 +08:00