Compare commits

...

34 Commits

Author SHA1 Message Date
Xiaomeng Zhao
b03a7fae5e Merge pull request #1153 from opendatalab/release-0.10.4
Release 0.10.4
2024-11-30 02:47:28 +08:00
Xiaomeng Zhao
9726403c69 Merge pull request #1152 from myhloli/dev
fix(mkcontent): optimize paragraph text merging and language detection
2024-11-30 02:45:02 +08:00
myhloli
b3127233f0 refactor: modify bbox processing for layout separation
- Remove overlap between bboxes for block separation
- Sort bboxes by combined x and y coordinates for better layout handling
- Comment out previous overlap removal function
2024-11-30 02:33:26 +08:00
myhloli
b80befe9cf refactor(mkcontent): optimize paragraph text merging and language detection
- Extract language detection to block level instead of line level
- Improve logic for handling Chinese, Japanese, and Korean languages
- Refactor code for better readability and performance
- Optimize handling of hyphenated words at line ends
2024-11-30 02:16:38 +08:00
myhloli
ea35fa6b60 Merge remote-tracking branch 'origin/dev' into dev 2024-11-30 01:14:26 +08:00
myhloli
c8cabb3cf6 feat(ocr_mkcontent): add language detection for line spacing
- Introduce language detection to determine line spacing based on language context
- Implement different spacing rules for Chinese/Japanese/Korean and Western texts
- Adjust span content handling based on detected language and span type
2024-11-30 01:14:12 +08:00
Xiaomeng Zhao
78c9014073 Merge pull request #1147 from opendatalab/master
master->dev
2024-11-29 16:44:47 +08:00
myhloli
d19911f113 Update version.py with new version 2024-11-29 08:03:01 +00:00
Xiaomeng Zhao
b3fbedf055 Merge pull request #1143 from opendatalab/release-0.10.3
Release 0.10.3
2024-11-29 16:01:36 +08:00
Xiaomeng Zhao
66bd0f8b69 Merge pull request #1141 from myhloli/dev
refactor(ocr): Fix the error of paddleocr failing to initialize in a multi-threaded environment
2024-11-29 12:03:48 +08:00
myhloli
7f2f2c0f28 refactor(ocr): Fix the error of paddleocr failing to initialize in a multi-threaded environment 2024-11-29 12:02:48 +08:00
Xiaomeng Zhao
68c455309d Merge pull request #1140 from myhloli/dev
refactor(pdf_parse): adjust character-axis alignment algorithm
2024-11-29 12:00:36 +08:00
myhloli
d4345b6e39 refactor(pdf_parse): adjust character-axis alignment algorithm
- Introduce `span_height_radio` parameter to calculate_char_in_span function
- Replace fixed ratio with dynamic ratio for character and span axis alignment
- Improve flexibility and accuracy of character placement within spans
2024-11-29 11:59:52 +08:00
Xiaomeng Zhao
086b48b7ae Merge pull request #1139 from myhloli/dev
fix(ocr_mkcontent): handle empty paragraphs on pages
2024-11-29 11:59:03 +08:00
myhloli
782e6571bc fix(ocr_mkcontent): handle empty paragraphs on pages
- Add empty paragraph handling for pages with no content
- Append an empty markdown object when a page has no paragraphs
- Increment page number even if no content is present
2024-11-29 11:58:34 +08:00
Xiaomeng Zhao
4adabc37ac Merge pull request #1138 from myhloli/dev
feat(pdf_parse): add line start flag detection and optimize line stop flag logic
2024-11-28 23:13:47 +08:00
myhloli
949d0867fb feat(pdf_parse): add line start flag detection and optimize line stop flag logic
- Add LINE_START_FLAG tuple to identify starting flags of a line
- Modify calculate_char_in_span function to handle both line start and stop flags
- Remove redundant char_is_line_stop_flag variable and simplify logic
- Improve line flag detection to enhance text extraction accuracy
2024-11-28 23:12:37 +08:00
Xiaomeng Zhao
a1cff28c74 Merge pull request #1137 from myhloli/dev
refactor(pdf_check): improve character detection using PyMuPDF
2024-11-28 22:36:30 +08:00
myhloli
ac88815620 refactor(pdf_check): improve character detection using PyMuPDF
- Replace pdfminer with PyMuPDF for character detection
- Implement new method detect_invalid_chars_by_pymupdf
- Update check_invalid_chars in pdf_meta_scan.py to use new method
- Add __replace_0xfffd function in pdf_parse_union_core_v2.py to handle special characters
- Remove unused imports and update requirements.txt
2024-11-28 22:34:23 +08:00
Xiaomeng Zhao
b4dfa0f92f Merge pull request #1136 from myhloli/dev
refactor(ocr): improve text processing and span handling
2024-11-28 19:39:28 +08:00
myhloli
88c0854a65 refactor(ocr): improve text processing and span handling
- Remove unused language detection code
- Simplify text content processing logic
- Update span sorting and text extraction in pdf_parse_union_core_v2.py
2024-11-28 19:38:30 +08:00
Xiaomeng Zhao
c295587b9e Merge pull request #1135 from myhloli/dev
feat(pdf_parse): filter out skewed text lines
2024-11-28 18:53:06 +08:00
myhloli
37da8c44c4 feat(pdf_parse): filter out skewed text lines
- Add direction filtering to ignore highly skewed text lines
- Improve text extraction accuracy by focusing on non-skewed content
2024-11-28 18:52:18 +08:00
Xiaomeng Zhao
5ecafbfa7d Merge pull request #1134 from myhloli/dev
refactor(para): improve language detection and block splitting
2024-11-28 18:07:23 +08:00
myhloli
f674b8d413 refactor(para): improve language detection and block splitting
- Add language detection for each block of text
- Implement language-specific logic for right margin alignment
- Introduce logging for debugging purposes
2024-11-28 18:06:17 +08:00
Xiaomeng Zhao
e22fa18b46 Merge pull request #1132 from myhloli/dev
fix(Hybrid OCR):Enable Hybrid OCR for Empty Spans That Contain a Certain Number of Placeholders but No Actual Text
2024-11-28 15:34:00 +08:00
myhloli
08392d63a0 fix(Hybrid OCR):Enable Hybrid OCR for Empty Spans That Contain a Certain Number of Placeholders but No Actual Text 2024-11-28 15:29:42 +08:00
Xiaomeng Zhao
f09c1cd284 Merge pull request #1130 from myhloli/dev
fix(lite_model): Adapt Lite Mode to the Hybrid OCR Mode in Version 0.10
2024-11-28 15:27:52 +08:00
myhloli
9b4d77dcd4 fix(lite_model): Adapt Lite Mode to the Hybrid OCR Mode in Version 0.10 2024-11-28 15:06:54 +08:00
Xiaomeng Zhao
89c7bd0419 Merge pull request #1121 from opendatalab/master
master -> dev
2024-11-27 18:33:05 +08:00
myhloli
52ef1bc782 Update version.py with new version 2024-11-27 10:31:09 +00:00
Xiaomeng Zhao
8afff9aee8 Merge pull request #1120 from opendatalab/release-0.10.2
Release 0.10.2
2024-11-27 18:16:02 +08:00
yyy
4df1eb74fa Update daily.yml 2024-11-26 12:49:29 +08:00
Xiaomeng Zhao
fcfaede87b Update bug_report.yml 2024-11-25 14:39:59 +08:00
14 changed files with 224 additions and 151 deletions

View File

@@ -81,6 +81,7 @@ body:
- "0.7.x"
- "0.8.x"
- "0.9.x"
- "0.10.x"
validations:
required: true

View File

@@ -1,53 +1 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
name: mineru
jobs:
cli-test:
runs-on: pdf
timeout-minutes: 240
strategy:
fail-fast: true
steps:
- name: PDF cli
uses: actions/checkout@v3
with:
fetch-depth: 2
- name: install&test
run: |
source activate mineru
conda env list
pip show coverage
git checkout "dev"
# cd $GITHUB_WORKSPACE && sh tests/retry_env.sh
cd $GITHUB_WORKSPACE && python tests/clean_coverage.py
cd $GITHUB_WORKSPACE && coverage run -m pytest tests/unittest/ --cov=magic_pdf/ --cov-report html --cov-report term-missing
cd $GITHUB_WORKSPACE && python tests/get_coverage.py
cd $GITHUB_WORKSPACE && pytest -s -v tests/test_cli/test_cli_sdk.py
notify_to_feishu:
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }}
needs: cli-test
runs-on: pdf
steps:
- name: get_actor
run: |
metion_list="dt-yy"
echo $GITHUB_ACTOR
if [[ $GITHUB_ACTOR == "drunkpig" ]]; then
metion_list="xuchao"
elif [[ $GITHUB_ACTOR == "myhloli" ]]; then
metion_list="zhaoxiaomeng"
elif [[ $GITHUB_ACTOR == "icecraft" ]]; then
metion_list="xurui1"
fi
echo $metion_list
echo "METIONS=$metion_list" >> "$GITHUB_ENV"
echo ${{ env.METIONS }}
- name: notify
run: |
echo ${{ secrets.USER_ID }}
curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"'${{ github.repository }}' GitHubAction Failed","content":[[{"tag":"text","text":""},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}' ${{ secrets.WEBHOOK_URL }}

View File

@@ -30,6 +30,13 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
for page_info in pdf_info_dict:
paras_of_layout = page_info.get('para_blocks')
if not paras_of_layout:
markdown_with_para_and_pagination.append({
'page_no':
page_no,
'md_content':
'',
})
page_no += 1
continue
page_markdown = ocr_mk_markdown_with_para_core_v2(
paras_of_layout, 'mm', img_buket_path)
@@ -129,21 +136,19 @@ def __replace_ligatures(text: str):
def merge_para_with_text(para_block):
block_text = ''
for line in para_block['lines']:
for span in line['spans']:
if span['type'] in [ContentType.Text]:
block_text += span['content']
block_lang = detect_lang(block_text)
para_text = ''
for i, line in enumerate(para_block['lines']):
if i >= 1 and line.get(ListLineTag.IS_LIST_START_LINE, False):
para_text += ' \n'
line_text = ''
line_lang = ''
for span in line['spans']:
span_type = span['type']
if span_type == ContentType.Text:
line_text += span['content'].strip()
if line_text != '':
line_lang = detect_lang(line_text)
for j, span in enumerate(line['spans']):
span_type = span['type']
@@ -156,20 +161,20 @@ def merge_para_with_text(para_block):
content = f"\n$$\n{span['content']}\n$$\n"
content = content.strip()
if content != '':
if content:
langs = ['zh', 'ja', 'ko']
if line_lang in langs: # 遇到一些一个字一个span的文档这种单字语言判断不准需要用整行文本判断
if span_type in [ContentType.Text, ContentType.InterlineEquation]:
para_text += content # 中文/日语/韩文语境下content间不需要空格分隔
elif span_type == ContentType.InlineEquation:
para_text += f' {content} '
# logger.info(f'block_lang: {block_lang}, content: {content}')
if block_lang in langs: # 中文/日语/韩文语境下,换行不需要空格分隔
if j == len(line['spans']) - 1:
para_text += content
else:
para_text += f'{content} '
else:
if span_type in [ContentType.Text, ContentType.InlineEquation]:
# 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
if j == len(line['spans'])-1 and span_type == ContentType.Text and __is_hyphen_at_line_end(content):
para_text += content[:-1]
elif len(content) == 1 and content not in ['A', 'I', 'a', 'i'] and not content.isdigit():
para_text += content
else: # 西方文本语境下 content间需要空格分隔
para_text += f'{content} '
elif span_type == ContentType.InterlineEquation:
@@ -177,7 +182,7 @@ def merge_para_with_text(para_block):
else:
continue
# 连写字符拆分
para_text = __replace_ligatures(para_text)
# para_text = __replace_ligatures(para_text)
return para_text

View File

@@ -8,7 +8,7 @@ from loguru import logger
from magic_pdf.config.drop_reason import DropReason
from magic_pdf.libs.commons import get_top_percent_list, mymax
from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.pdf_check import detect_invalid_chars
from magic_pdf.libs.pdf_check import detect_invalid_chars_by_pymupdf
scan_max_page = 50
junk_limit_min = 10
@@ -323,7 +323,7 @@ def get_language(doc: fitz.Document):
def check_invalid_chars(pdf_bytes):
"""乱码检测."""
return detect_invalid_chars(pdf_bytes)
return detect_invalid_chars_by_pymupdf(pdf_bytes)
def pdf_meta_scan(pdf_bytes: bytes):

View File

@@ -1,9 +1,9 @@
from io import BytesIO
import re
import fitz
import numpy as np
from loguru import logger
from pdfminer.high_level import extract_text
# import re
# from io import BytesIO
# from pdfminer.high_level import extract_text
def calculate_sample_count(total_page: int):
@@ -14,7 +14,7 @@ def calculate_sample_count(total_page: int):
return select_page_cnt
def extract_pages(src_pdf_bytes: bytes):
def extract_pages(src_pdf_bytes: bytes) -> fitz.Document:
pdf_docs = fitz.open("pdf", src_pdf_bytes)
total_page = len(pdf_docs)
if total_page == 0:
@@ -33,30 +33,57 @@ def extract_pages(src_pdf_bytes: bytes):
return sample_docs
def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
""""
检测PDF中是否包含非法字符
# def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
# """"
# 检测PDF中是否包含非法字符
# """
# '''pdfminer比较慢,需要先随机抽取10页左右的sample'''
# sample_docs = extract_pages(src_pdf_bytes)
# sample_pdf_bytes = sample_docs.tobytes()
# sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
# text = extract_text(sample_pdf_file_like_object)
# text = text.replace("\n", "")
# # logger.info(text)
# '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
# cid_pattern = re.compile(r'\(cid:\d+\)')
# matches = cid_pattern.findall(text)
# cid_count = len(matches)
# cid_len = sum(len(match) for match in matches)
# text_len = len(text)
# if text_len == 0:
# cid_chars_radio = 0
# else:
# cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
# logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
# '''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
# if cid_chars_radio > 0.05:
# return False # 乱码文档
# else:
# return True # 正常文档
def count_replacement_characters(text: str) -> int:
"""
'''pdfminer比较慢,需要先随机抽取10页左右的sample'''
统计字符串中 0xfffd 字符的数量。
"""
return text.count('\ufffd')
def detect_invalid_chars_by_pymupdf(src_pdf_bytes: bytes) -> bool:
sample_docs = extract_pages(src_pdf_bytes)
sample_pdf_bytes = sample_docs.tobytes()
sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
text = extract_text(sample_pdf_file_like_object)
text = text.replace("\n", "")
# logger.info(text)
'''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
cid_pattern = re.compile(r'\(cid:\d+\)')
matches = cid_pattern.findall(text)
cid_count = len(matches)
cid_len = sum(len(match) for match in matches)
text_len = len(text)
doc_text = ""
for page in sample_docs:
page_text = page.get_text('text', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)
doc_text += page_text
text_len = len(doc_text)
uffd_count = count_replacement_characters(doc_text)
if text_len == 0:
cid_chars_radio = 0
uffd_chars_radio = 0
else:
cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
'''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
if cid_chars_radio > 0.05:
uffd_chars_radio = uffd_count / text_len
logger.info(f"uffd_count: {uffd_count}, text_len: {text_len}, uffd_chars_radio: {uffd_chars_radio}")
'''当一篇文章存在1%以上的文本是乱码时,认为该文档为乱码文档'''
if uffd_chars_radio > 0.01:
return False # 乱码文档
else:
return True # 正常文档
return True # 正常文档

View File

@@ -1 +1 @@
__version__ = "0.10.1"
__version__ = "0.10.3"

View File

@@ -18,11 +18,31 @@ def region_to_bbox(region):
class CustomPaddleModel:
def __init__(self, ocr: bool = False, show_log: bool = False, lang=None):
def __init__(self,
ocr: bool = False,
show_log: bool = False,
lang=None,
det_db_box_thresh=0.3,
use_dilation=True,
det_db_unclip_ratio=1.8
):
if lang is not None:
self.model = PPStructure(table=False, ocr=ocr, show_log=show_log, lang=lang)
self.model = PPStructure(table=False,
ocr=True,
show_log=show_log,
lang=lang,
det_db_box_thresh=det_db_box_thresh,
use_dilation=use_dilation,
det_db_unclip_ratio=det_db_unclip_ratio,
)
else:
self.model = PPStructure(table=False, ocr=ocr, show_log=show_log)
self.model = PPStructure(table=False,
ocr=True,
show_log=show_log,
det_db_box_thresh=det_db_box_thresh,
use_dilation=use_dilation,
det_db_unclip_ratio=det_db_unclip_ratio,
)
def __call__(self, img):
try:

View File

@@ -1,9 +1,55 @@
import cv2
import numpy as np
from loguru import logger
from io import BytesIO
from PIL import Image
import base64
from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold
from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line
from ppocr.utils.utility import check_and_read
def img_decode(content: bytes):
np_arr = np.frombuffer(content, dtype=np.uint8)
return cv2.imdecode(np_arr, cv2.IMREAD_UNCHANGED)
def check_img(img):
if isinstance(img, bytes):
img = img_decode(img)
if isinstance(img, str):
image_file = img
img, flag_gif, flag_pdf = check_and_read(image_file)
if not flag_gif and not flag_pdf:
with open(image_file, 'rb') as f:
img_str = f.read()
img = img_decode(img_str)
if img is None:
try:
buf = BytesIO()
image = BytesIO(img_str)
im = Image.open(image)
rgb = im.convert('RGB')
rgb.save(buf, 'jpeg')
buf.seek(0)
image_bytes = buf.read()
data_base64 = str(base64.b64encode(image_bytes),
encoding="utf-8")
image_decode = base64.b64decode(data_base64)
img_array = np.frombuffer(image_decode, np.uint8)
img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
except:
logger.error("error in loading image:{}".format(image_file))
return None
if img is None:
logger.error("error in loading image:{}".format(image_file))
return None
if isinstance(img, np.ndarray) and len(img.shape) == 2:
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
return img
def bbox_to_points(bbox):
""" 将bbox格式转换为四个顶点的数组 """

View File

@@ -1,15 +1,17 @@
import copy
import time
import cv2
import numpy as np
from paddleocr import PaddleOCR
from paddleocr.paddleocr import check_img, logger
from paddleocr.ppocr.utils.utility import alpha_to_color, binarize_img
from paddleocr.tools.infer.predict_system import sorted_boxes
from paddleocr.tools.infer.utility import get_rotate_crop_image, get_minarea_rect_crop
from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import update_det_boxes, merge_det_boxes
from paddleocr import PaddleOCR
from ppocr.utils.logging import get_logger
from ppocr.utils.utility import alpha_to_color, binarize_img
from tools.infer.predict_system import sorted_boxes
from tools.infer.utility import get_rotate_crop_image, get_minarea_rect_crop
from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import update_det_boxes, merge_det_boxes, check_img
logger = get_logger()
class ModifiedPaddleOCR(PaddleOCR):

View File

@@ -2,8 +2,8 @@ import os
import cv2
import numpy as np
from paddleocr.ppstructure.table.predict_table import TableSystem
from paddleocr.ppstructure.utility import init_args
from ppstructure.table.predict_table import TableSystem
from ppstructure.utility import init_args
from PIL import Image
from magic_pdf.config.constants import * # noqa: F403

View File

@@ -1,7 +1,10 @@
import copy
from loguru import logger
from magic_pdf.config.constants import CROSS_PAGE, LINES_DELETED
from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.libs.language import detect_lang
LINE_STOP_FLAG = (
'.',
@@ -125,6 +128,9 @@ def __is_list_or_index_block(block):
# 添加所有文本包括空行保持与block['lines']长度一致
lines_text_list.append(line_text)
block_text = ''.join(lines_text_list)
block_lang = detect_lang(block_text)
# logger.info(f"block_lang: {block_lang}")
# 计算line左侧顶格数量是否大于2是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断
if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2:
@@ -136,13 +142,16 @@ def __is_list_or_index_block(block):
if abs(block['bbox_fs'][2] - line['bbox'][2]) < line_height:
right_close_num += 1
else:
# 右侧不顶格情况下是否有一段距离拍脑袋用0.3block宽度做阈值
# block宽的阈值可以小些block窄的阈值要大
if block_weight_radio >= 0.5:
# 类中文没有超长单词的情况,可以用统一的阈值
if block_lang in ['zh', 'ja', 'ko']:
closed_area = 0.26 * block_weight
else:
closed_area = 0.36 * block_weight
# 右侧不顶格情况下是否有一段距离拍脑袋用0.3block宽度做阈值
# block宽的阈值可以小些block窄的阈值要大
if block_weight_radio >= 0.5:
closed_area = 0.26 * block_weight
else:
closed_area = 0.36 * block_weight
if block['bbox_fs'][2] - line['bbox'][2] > closed_area:
right_not_close_num += 1

View File

@@ -30,22 +30,14 @@ try:
torchtext.disable_torchtext_deprecation_warning()
except ImportError:
pass
from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
from magic_pdf.para.para_split_v3 import para_split
from magic_pdf.pre_proc.construct_page_dict import \
ocr_construct_page_component_v2
from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2
from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
ocr_prepare_bboxes_for_layout_split_v2
from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
fix_block_spans_v2,
fix_discarded_block)
from magic_pdf.pre_proc.ocr_span_list_modify import (
get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
remove_overlaps_min_spans)
from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split_v2
from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block
from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, remove_overlaps_min_spans
def __replace_STX_ETX(text_str: str):
@@ -65,10 +57,18 @@ def __replace_STX_ETX(text_str: str):
return text_str
def __replace_0xfffd(text_str: str):
"""Replace \ufffd, as these characters become garbled when extracted using pymupdf."""
if text_str:
s = text_str.replace('\ufffd', " ")
return s
return text_str
def chars_to_content(span):
# 检查span中的char是否为空
if len(span['chars']) == 0:
span['content'] = ''
pass
# span['content'] = ''
else:
# 先给chars按char['bbox']的中心点的x坐标排序
span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
@@ -83,22 +83,24 @@ def chars_to_content(span):
if char['bbox'][0] - span['chars'][span['chars'].index(char) - 1]['bbox'][2] > char_avg_width:
content += ' '
content += char['c']
span['content'] = __replace_STX_ETX(content)
span['content'] = __replace_0xfffd(content)
del span['chars']
LINE_STOP_FLAG = ('.', '!', '?', '', '', '', ')', '', '"', '', ':', '', ';', '', ']', '', '}', '}', '>', '', '', ',', '', '-', '', '',)
LINE_START_FLAG = ('(', '', '"', '', '', '{', '', '<', '', '', '', '[',)
def fill_char_in_spans(spans, all_chars):
# 简单从上到下排一下序
spans = sorted(spans, key=lambda x: x['bbox'][1])
for char in all_chars:
for span in spans:
# 判断char是否属于LINE_STOP_FLAG
if char['c'] in LINE_STOP_FLAG:
char_is_line_stop_flag = True
else:
char_is_line_stop_flag = False
if calculate_char_in_span(char['bbox'], span['bbox'], char_is_line_stop_flag):
if calculate_char_in_span(char['bbox'], span['bbox'], char['c']):
span['chars'].append(char)
break
@@ -106,13 +108,16 @@ def fill_char_in_spans(spans, all_chars):
for span in spans:
chars_to_content(span)
if len(span['content']) == 0:
# 有的span中虽然没有字但有一两个空的占位符用宽高和content长度过滤
if len(span['content']) * span['height'] < span['width'] * 0.5:
# logger.info(f"maybe empty span: {len(span['content'])}, {span['height']}, {span['width']}")
empty_spans.append(span)
del span['height'], span['width']
return empty_spans
# 使用鲁棒性更强的中心点坐标判断
def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33):
char_center_x = (char_bbox[0] + char_bbox[2]) / 2
char_center_y = (char_bbox[1] + char_bbox[3]) / 2
span_center_y = (span_bbox[1] + span_bbox[3]) / 2
@@ -121,18 +126,26 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
if (
span_bbox[0] < char_center_x < span_bbox[2]
and span_bbox[1] < char_center_y < span_bbox[3]
and abs(char_center_y - span_center_y) < span_height / 4 # 字符的中轴和span的中轴高度差不能超过1/4span高度
and abs(char_center_y - span_center_y) < span_height * span_height_radio # 字符的中轴和span的中轴高度差不能超过1/4span高度
):
return True
else:
# 如果char是LINE_STOP_FLAG就不用中心点判定换一种方案左边界在span区域内高度判定和之前逻辑一致
# 主要是给结尾符号一个进入span的机会这个char还应该离span右边界较近
if char_is_line_stop_flag:
if char in LINE_STOP_FLAG:
if (
(span_bbox[2] - span_height) < char_bbox[0] < span_bbox[2]
and char_center_x > span_bbox[0]
and span_bbox[1] < char_center_y < span_bbox[3]
and abs(char_center_y - span_center_y) < span_height / 4
and abs(char_center_y - span_center_y) < span_height * span_height_radio
):
return True
elif char in LINE_START_FLAG:
if (
span_bbox[0] < char_bbox[2] < (span_bbox[0] + span_height)
and char_center_x < span_bbox[2]
and span_bbox[1] < char_center_y < span_bbox[3]
and abs(char_center_y - span_center_y) < span_height * span_height_radio
):
return True
else:
@@ -141,12 +154,14 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
# @todo: 拿到char之后把倾斜角度较大的先删一遍
all_pymu_chars = []
for block in text_blocks_raw:
for line in block['lines']:
cosine, sine = line['dir']
if abs (cosine) < 0.9 or abs(sine) > 0.1:
continue
for span in line['spans']:
all_pymu_chars.extend(span['chars'])
@@ -157,6 +172,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
continue
span_height = span['bbox'][3] - span['bbox'][1]
span['height'] = span_height
span['width'] = span['bbox'][2] - span['bbox'][0]
span_height_list.append(span_height)
if len(span_height_list) == 0:
return spans
@@ -174,15 +190,13 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
if block[7] in [BlockType.ImageBody, BlockType.TableBody, BlockType.InterlineEquation]:
continue
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
if span['height'] > median_span_height * 3 and span['height'] > (span['bbox'][2] - span['bbox'][0]) * 3:
if span['height'] > median_span_height * 3 and span['height'] > span['width'] * 3:
vertical_spans.append(span)
elif block in all_bboxes:
useful_spans.append(span)
else:
unuseful_spans.append(span)
del span['height']
break
"""垂直的span框直接用pymu的line进行填充"""
@@ -232,6 +246,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
if ocr_res and len(ocr_res) > 0:
if len(ocr_res[0]) > 0:
ocr_text, ocr_score = ocr_res[0][0]
# logger.info(f"ocr_text: {ocr_text}, ocr_score: {ocr_score}")
if ocr_score > 0.5 and len(ocr_text) > 0:
span['content'] = ocr_text
span['score'] = ocr_score

View File

@@ -117,8 +117,8 @@ def ocr_prepare_bboxes_for_layout_split_v2(
all_bboxes = remove_overlaps_min_blocks(all_bboxes)
all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
"""将剩余的bbox做分离处理防止后面分layout时出错"""
all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
# all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
all_bboxes.sort(key=lambda x: x[0]+x[1])
return all_bboxes, all_discarded_blocks

View File

@@ -4,10 +4,10 @@ click>=8.1.7
fast-langdetect==0.2.0
loguru>=0.6.0
numpy>=1.21.6,<2.0.0
pdfminer.six==20231228
pydantic>=2.7.2,<2.8.0
PyMuPDF>=1.24.9
scikit-learn>=1.0.2
torch>=2.2.2,<=2.3.1
transformers
# pdfminer.six==20231228
# The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.