增加ocr模式的layout解析功能

修复一个span可能没有content导致的问题
ocr拼接逻辑更新
2026-04-12 15:29:03 +07:00 · 2024-03-07 20:41:41 +08:00 · 2024-03-07 16:15:14 +08:00 · 2024-03-07 16:04:16 +08:00 · 2024-03-06 18:03:55 +08:00 · 2024-03-06 17:48:29 +08:00
19 changed files with 465 additions and 19 deletions
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -20,21 +20,29 @@ jobs:
        python-version: ["3.10"]

    steps:
-    - uses: actions/checkout@v4
+    - name: Checkout code
+      uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
+
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
+
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+
    - name: Install wheel
      run: |
        python -m pip install wheel
+
    - name: Build wheel
      run: |
        python setup.py bdist_wheel
+
    - name: Upload artifact
      uses: actions/upload-artifact@v4
      with:
--- a/README.md
+++ b/README.md
@@ -19,6 +19,17 @@ python 3.9+
 git clone https://github.com/myhloli/Magic-PDF.git
 ```

+2.Install the requirements
+
+```sh
+pip install -r requirements.txt
+```
+
+3.Run the main script
+
+```sh
+python demo/demo_test.py
+```

 ### 版权说明

--- a/demo/download.py
+++ b/demo/download.py
@@ -2,7 +2,7 @@ import json
 import os
 from tqdm import tqdm

-from magic_pdf.libs import join_path
+from magic_pdf.libs.commons import join_path

 with open('/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset.json', 'r') as f:
    samples = json.load(f)
--- a/demo/draw_bbox.py
+++ b/demo/draw_bbox.py
@@ -1,4 +1,4 @@
-from magic_pdf.libs import fitz  # PyMuPDF
+from magic_pdf.libs.commons import fitz  # PyMuPDF

 # PDF文件路径
 pdf_path = "D:\\project\\20231108code-clean\\code-clean\\tmp\\unittest\\download-pdfs\\scihub\\scihub_53700000\\libgen.scimag53724000-53724999.zip_10.1097\\00129191-200509000-00018.pdf"
--- a/demo/ocr_demo.py
+++ b/demo/ocr_demo.py
@@ -0,0 +1,39 @@
+import json
+import os
+
+from loguru import logger
+
+from magic_pdf.dict2md.ocr_mkcontent import mk_nlp_markdown
+from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
+
+
+def save_markdown(markdown_text, input_filepath):
+    """Write ``markdown_text`` to a ``.md`` file placed next to ``input_filepath``.
+
+    The output keeps the input's directory and base name; only the final
+    extension is replaced by ``.md``. The file is written as UTF-8 and an
+    existing file at that path is overwritten.
+    """
+    parent_dir, source_name = os.path.split(input_filepath)
+    stem, _ext = os.path.splitext(source_name)
+    target_path = os.path.join(parent_dir, stem + ".md")
+
+    with open(target_path, 'w', encoding='utf-8') as md_file:
+        md_file.write(markdown_text)
+
+
+def read_json_file(file_path):
+    """Load and return the JSON document stored at ``file_path``.
+
+    The file is decoded as UTF-8 explicitly so that non-ASCII text is read
+    the same way regardless of the platform's default locale encoding.
+    """
+    with open(file_path, 'r', encoding='utf-8') as f:
+        return json.load(f)
+
+
+if __name__ == '__main__':
+    # Demo entry point: parse one OCR result JSON, render it as Markdown,
+    # log the text and save it next to the input file.
+    ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_1(3).json"
+    try:
+        ocr_pdf_info = read_json_file(ocr_json_file_path)
+        pdf_info_dict = parse_pdf_by_ocr(ocr_pdf_info)
+        markdown_text = mk_nlp_markdown(pdf_info_dict)
+        logger.info(markdown_text)
+        save_markdown(markdown_text, ocr_json_file_path)
+    except Exception as e:
+        # Keep the demo from crashing, but record the traceback as well as
+        # the message so the failing step can be located.
+        logger.exception(e)
--- a/demo/pdf2md.py
+++ b/demo/pdf2md.py
@@ -5,7 +5,7 @@ from pathlib import Path
 import click
 from loguru import logger

-from magic_pdf.libs import join_path
+from magic_pdf.libs.commons import join_path
 from magic_pdf.dict2md.mkcontent import mk_mm_markdown
 from magic_pdf.pipeline import parse_pdf_by_model

--- a/magic_pdf/dict2md/ocr_mkcontent.py
+++ b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -0,0 +1,23 @@
+def mk_nlp_markdown(pdf_info_dict: dict):
+    """Render the parsed pages as Markdown, one output line per parsed line.
+
+    Args:
+        pdf_info_dict: Maps page keys to page dicts. Only the
+            ``preproc_blocks`` entry of each page is read; pages without
+            blocks are skipped.
+
+    Returns:
+        str: The Markdown lines joined with ``"\\n"``. Every line ends with
+        two spaces, which forces a hard line break in Markdown.
+    """
+    markdown = []
+
+    for page_info in pdf_info_dict.values():
+        blocks = page_info.get("preproc_blocks")
+        if not blocks:
+            continue
+        for block in blocks:
+            for line in block['lines']:
+                pieces = []
+                for span in line['spans']:
+                    # Spans such as images or tables may carry no content.
+                    if not span.get('content'):
+                        continue
+                    # Escape literal dollar signs so they are not taken as
+                    # math delimiters.
+                    content = span['content'].replace('$', '\\$')
+                    if span['type'] == 'inline_equation':
+                        content = f"${content}$"
+                    elif span['type'] == 'displayed_equation':
+                        content = f"$$\n{content}\n$$"
+                    pieces.append(content)
+                # Two trailing spaces force a line break in Markdown.
+                markdown.append(' '.join(pieces).strip() + '  ')
+    return '\n'.join(markdown)
--- a/magic_pdf/libs/boxbase.py
+++ b/magic_pdf/libs/boxbase.py
@@ -119,6 +119,20 @@ def _is_left_overlap(box1, box2,):
    return x0_1<=x0_2<=x1_1 and vertical_overlap_cond


+def __is_overlaps_y_exceeds_threshold(bbox1, bbox2, overlap_ratio_threshold=0.8):
+    """Check whether two boxes overlap enough on the y axis.
+
+    The vertical overlap is measured against the height of the shorter box.
+
+    Args:
+        bbox1: Box as ``(x0, y0, x1, y1)``.
+        bbox2: Box as ``(x0, y0, x1, y1)``.
+        overlap_ratio_threshold: Minimum share of the shorter box's height
+            that must be exceeded.
+
+    Returns:
+        bool: True when ``overlap / shorter_height`` is strictly greater than
+        the threshold. A box without positive height never qualifies.
+    """
+    _, y0_1, _, y1_1 = bbox1
+    _, y0_2, _, y1_2 = bbox2
+
+    overlap = max(0, min(y1_1, y1_2) - max(y0_1, y0_2))
+    min_height = min(y1_1 - y0_1, y1_2 - y0_2)
+    if min_height <= 0:
+        # Degenerate box: the ratio is undefined, so report "no overlap"
+        # instead of dividing by zero.
+        return False
+
+    return (overlap / min_height) > overlap_ratio_threshold
+
+
+
 def calculate_iou(bbox1, bbox2):
    # Determine the coordinates of the intersection rectangle
    x_left = max(bbox1[0], bbox2[0])
@@ -163,7 +177,25 @@ def calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2):
    else:
        return intersection_area / min_box_area

-    
+
+def get_minbox_if_overlap_by_ratio(bbox1, bbox2, ratio):
+    """Return the smaller of two boxes when they overlap heavily enough.
+
+    The overlap share comes from ``calculate_overlap_area_2_minbox_area_ratio``.
+    When that share is strictly greater than ``ratio``, the box with the
+    strictly smaller area is returned. ``None`` is returned when the share
+    is not exceeded or when both areas are equal.
+    """
+    left1, top1, right1, bottom1 = bbox1
+    left2, top2, right2, bottom2 = bbox2
+    size1 = (right1 - left1) * (bottom1 - top1)
+    size2 = (right2 - left2) * (bottom2 - top2)
+
+    share = calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2)
+    if not share > ratio:
+        return None
+    if size1 < size2:
+        return bbox1
+    if size2 < size1:
+        return bbox2
+    return None
+
 def get_bbox_in_boundry(bboxes:list, boundry:tuple)-> list:
    x0, y0, x1, y1 = boundry
    new_boxes = [box for box in bboxes if box[0] >= x0 and box[1] >= y0 and box[2] <= x1 and box[3] <= y1]
--- a/magic_pdf/pdf_parse_by_model.py
+++ b/magic_pdf/pdf_parse_by_model.py
@@ -59,6 +59,7 @@ from magic_pdf.post_proc.pdf_post_filter import pdf_post_filter
 from magic_pdf.pre_proc.remove_rotate_bbox import get_side_boundry, remove_rotate_side_textblock, remove_side_blank_block
 from magic_pdf.pre_proc.resolve_bbox_conflict import check_text_block_horizontal_overlap, resolve_bbox_overlap_conflict
 from magic_pdf.pre_proc.fix_table import fix_table_text_block, fix_tables, include_table_title
+from magic_pdf.pre_proc.solve_line_alien import solve_inline_too_large_interval

 denseSingleLineBlockException_msg = DenseSingleLineBlockException().message
 titleDetectionException_msg = TitleDetectionException().message
@@ -446,6 +447,10 @@ def parse_pdf_by_model(
    ==================================================================================================================================
    进入段落处理-2阶段
    """
+
+    # 处理行内文字间距较大问题
+    pdf_info_dict = solve_inline_too_large_interval(pdf_info_dict)
+    
    start_time = time.time()

    para_process_pipeline = ParaProcessPipeline()
--- a/magic_pdf/pdf_parse_by_ocr.py
+++ b/magic_pdf/pdf_parse_by_ocr.py
@@ -0,0 +1,93 @@
+from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
+from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line, remove_overlaps_min_spans
+
+
+def construct_page_component(page_id, blocks, layout_bboxes):
+    """Bundle one page's blocks, page index and layout boxes into a dict."""
+    return dict(
+        preproc_blocks=blocks,
+        page_idx=page_id,
+        layout_bboxes=layout_bboxes,
+    )
+
+
+def parse_pdf_by_ocr(
+    ocr_pdf_info,
+    start_page_id=0,
+    end_page_id=None,
+):
+    """Convert per-page OCR detections into the ``pdf_info_dict`` structure.
+
+    For every page in ``[start_page_id, end_page_id]`` the supported entries
+    of ``layout_dets`` are turned into spans, heavily overlapping spans are
+    pruned, the remaining spans are merged into lines, and each line is
+    wrapped in a block of its own. Layout boxes are parsed from the page's
+    ``subfield_dets``.
+
+    Args:
+        ocr_pdf_info: Sequence of page records indexed by page id; each
+            record provides ``layout_dets`` and ``subfield_dets``.
+        start_page_id: Index of the first page to process.
+        end_page_id: Index of the last page to process (inclusive).
+            ``None`` means "through the last page"; ``0`` is honoured as a
+            real page index.
+
+    Returns:
+        dict: Maps ``"page_<id>"`` to the dict built by
+        ``construct_page_component``.
+    """
+    # category_id -> (span type, detection key holding the span content).
+    # Every other category is skipped, e.g. 3 header, 4 page number,
+    # 5 footnote, 6 footer, and the layout categories 11 full column and
+    # 12 sub column.
+    span_categories = {
+        1: ('image', None),
+        7: ('table', None),
+        13: ('inline_equation', 'latex'),
+        14: ('displayed_equation', 'latex'),
+        15: ('text', 'text'),  # OCR-recognised text
+    }
+
+    pdf_info_dict = {}
+    if end_page_id is None:
+        end_page_id = len(ocr_pdf_info) - 1
+    for page_id in range(start_page_id, end_page_id + 1):
+        ocr_page_info = ocr_pdf_info[page_id]
+        layout_dets = ocr_page_info['layout_dets']
+        spans = []
+        for layout_det in layout_dets:
+            category = span_categories.get(layout_det['category_id'])
+            if category is None:
+                continue
+            span_type, content_key = category
+
+            # 'poly' holds eight numbers; the 1st/2nd and 5th/6th are used
+            # as the (x0, y0) and (x1, y1) corners of the bbox.
+            x0, y0, _, _, x1, y1, _, _ = layout_det['poly']
+            span = {
+                'bbox': [int(x0), int(y0), int(x1), int(y1)],
+            }
+            if content_key is not None:
+                span['content'] = layout_det[content_key]
+            span['type'] = span_type
+            spans.append(span)
+
+        # Drop the smaller span of every heavily overlapping pair.
+        spans = remove_overlaps_min_spans(spans)
+
+        # TODO: extra handling for displayed_equation / image / table spans:
+        # when there is text to their left, move the span's y0 below the
+        # text's y0 (not implemented yet).
+
+        # Merge spans into lines (top to bottom, left to right).
+        lines = merge_spans_to_line(spans)
+
+        # No block merging yet: each block holds exactly one line and shares
+        # that line's bbox.
+        blocks = []
+        for line in lines:
+            blocks.append({
+                "bbox": line['bbox'],
+                "lines": [line],
+            })
+
+        # Parse the layout boxes from the page's sub-field detections.
+        layout_bboxes = layout_detect(ocr_page_info['subfield_dets'])
+
+        # Assemble the page entry of pdf_info_dict.
+        page_info = construct_page_component(page_id, blocks, layout_bboxes)
+        pdf_info_dict[f"page_{page_id}"] = page_info
+
+    return pdf_info_dict
+
--- a/magic_pdf/pre_proc/detect_equation.py
+++ b/magic_pdf/pre_proc/detect_equation.py
@@ -1,4 +1,4 @@
-from magic_pdf.libs.boxbase import _is_in               # 正则
+from magic_pdf.libs.boxbase import _is_in, calculate_overlap_area_2_minbox_area_ratio              # 正则
 from magic_pdf.libs.commons import fitz             # pyMuPDF库


@@ -18,7 +18,16 @@ def __solve_contain_bboxs(all_bbox_list: list):
                dump_list.append(all_bbox_list[i])
            elif _is_in(bbox2, bbox1):
                dump_list.append(all_bbox_list[j])
-    
+            else:
+                ratio = calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2)
+                if ratio > 0.7:
+                    s1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]) 
+                    s2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
+                    if s2 > s1:  
+                        dump_list.append(all_bbox_list[i])
+                    else:
+                        dump_list.append(all_bbox_list[i]) 
+
    # 遍历需要删除的列表中的每个元素
    for item in dump_list:
        
--- a/magic_pdf/pre_proc/ocr_detect_layout.py
+++ b/magic_pdf/pre_proc/ocr_detect_layout.py
@@ -0,0 +1,123 @@
+from magic_pdf.libs.boxbase import _is_part_overlap, _is_in
+
+def get_center_point(bbox):
+    """
+    Compute the center point of a bounding box.
+    Args:
+        bbox (list): Box coordinates; the first four elements are read as
+            top-left x, top-left y, bottom-right x, bottom-right y.
+    Returns:
+        list: Two elements, the x and y coordinate of the center.
+    """
+    left, top, right, bottom = bbox[0], bbox[1], bbox[2], bbox[3]
+    center_x = (left + right) / 2
+    center_y = (top + bottom) / 2
+    return [center_x, center_y]
+
+
+def get_area(bbox):
+    """
+    Compute the area of a bounding box.
+    Args:
+        bbox (list): Box coordinates; the first four elements are read as
+            top-left x, top-left y, bottom-right x, bottom-right y.
+    Returns:
+        The width multiplied by the height of the box.
+    """
+    width = bbox[2] - bbox[0]
+    height = bbox[3] - bbox[1]
+    return width * height
+
+
+def adjust_layouts(layout_bboxes):
+    """Trim partially overlapping layout boxes in place.
+
+    Every pair of entries is checked with ``_is_part_overlap``. For an
+    overlapping pair, one edge of the box with the larger area is moved to
+    the facing edge of the smaller box; the axis is picked from the offset
+    between the two box centers.
+
+    Args:
+        layout_bboxes (list): Dicts carrying a mutable ``"layout_bbox"``
+            list ``[x0, y0, x1, y1]``.
+
+    Returns:
+        list: The same list object, with the adjusted boxes.
+    """
+    for i in range(len(layout_bboxes)):
+        # Compare with every later entry so each pair is visited once.
+        for j in range(i + 1, len(layout_bboxes)):
+            box_i = layout_bboxes[i]["layout_bbox"]
+            box_j = layout_bboxes[j]["layout_bbox"]
+            # The overlap test works on coordinate lists, not on the
+            # wrapping dicts.
+            if _is_part_overlap(box_i, box_j):
+                center_i = get_center_point(box_i)
+                area_i = get_area(box_i)
+
+                center_j = get_center_point(box_j)
+                area_j = get_area(box_j)
+
+                # Horizontal and vertical distance between the centers.
+                dx = abs(center_i[0] - center_j[0])
+                dy = abs(center_i[1] - center_j[1])
+
+                # On equal areas the second box counts as the larger one.
+                if area_i > area_j:
+                    larger_layout, smaller_layout = layout_bboxes[i], layout_bboxes[j]
+                else:
+                    larger_layout, smaller_layout = layout_bboxes[j], layout_bboxes[i]
+
+                # NOTE(review): for boxes that really overlap, the first
+                # condition of each branch below looks like it always holds,
+                # which can push x0 past x1 (or y0 past y1) of the larger
+                # box - confirm the intended trimming rule.
+                if dx > dy:  # side-by-side overlap
+                    if larger_layout["layout_bbox"][0] < smaller_layout["layout_bbox"][2]:
+                        larger_layout["layout_bbox"][0] = smaller_layout["layout_bbox"][2]
+                    else:
+                        larger_layout["layout_bbox"][2] = smaller_layout["layout_bbox"][0]
+                else:  # stacked overlap
+                    if larger_layout["layout_bbox"][1] < smaller_layout["layout_bbox"][3]:
+                        larger_layout["layout_bbox"][1] = smaller_layout["layout_bbox"][3]
+                    else:
+                        larger_layout["layout_bbox"][3] = smaller_layout["layout_bbox"][1]
+
+    return layout_bboxes
+
+
+
+
+def layout_detect(layout_info):
+    """
+    Extract the sub-layout boxes, drop contained ones and adjust the rest.
+
+    Args:
+        layout_info (list): Sub-layout dicts, each with a 'poly' field of
+            eight numbers; the 1st/2nd and 5th/6th are used as the box corners.
+
+    Returns:
+        list: Dicts with a 'layout_bbox' field ``[x0, y0, x1, y1]``: the
+        boxes that ``_is_in`` does not report as lying inside another box,
+        after being passed through ``adjust_layouts``.
+
+    """
+    # Reduce every sub-layout polygon to a corner-based box.
+    candidates = []
+    for sub_layout in layout_info:
+        x0, y0, _, _, x1, y1, _, _ = sub_layout['poly']
+        candidates.append({"layout_bbox": [x0, y0, x1, y1]})
+
+    # Keep only boxes that are not contained in any other candidate.
+    survivors = [
+        candidate
+        for idx, candidate in enumerate(candidates)
+        if not any(
+            _is_in(candidate["layout_bbox"], other["layout_bbox"])
+            for other_idx, other in enumerate(candidates)
+            if other_idx != idx
+        )
+    ]
+
+    # Resolve partial overlaps between the remaining boxes.
+    return adjust_layouts(survivors)
+
+
--- a/magic_pdf/pre_proc/ocr_dict_merge.py
+++ b/magic_pdf/pre_proc/ocr_dict_merge.py
@@ -0,0 +1,60 @@
+from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio
+
+
+# 删除重叠spans中较小的那些
+def remove_overlaps_min_spans(spans):
+    """Remove the smaller span of every heavily overlapping pair, in place.
+
+    A span is dropped when ``get_minbox_if_overlap_by_ratio`` (ratio 0.8)
+    names its bbox as the smaller box of a pair with any other span of the
+    input, even if that other span is dropped as well. Survivors keep their
+    original order.
+
+    Args:
+        spans (list): Span dicts with a ``'bbox'`` entry; modified in place.
+
+    Returns:
+        list: The same list object with the dropped spans removed.
+    """
+    doomed = set()  # indices of spans to drop
+    for i, span1 in enumerate(spans):
+        # The overlap check is symmetric, so each unordered pair is
+        # examined once.
+        for j in range(i + 1, len(spans)):
+            span2 = spans[j]
+            overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.8)
+            if overlap_box is None:
+                continue
+            # A box is only returned when the areas differ, so it matches
+            # exactly one of the two bboxes.
+            doomed.add(i if overlap_box == span1['bbox'] else j)
+
+    # Slice assignment keeps the caller's list object.
+    spans[:] = [span for idx, span in enumerate(spans) if idx not in doomed]
+    return spans
+
+
+def merge_spans_to_line(spans):
+    """Group spans into lines, top to bottom, left to right within a line.
+
+    The spans are sorted in place by ``y0``. A span joins the current line
+    when it overlaps the line's last span enough on the y axis. Spans of
+    type ``displayed_equation``, ``image`` or ``table`` always get a line of
+    their own.
+
+    Args:
+        spans (list): Span dicts with ``'bbox'`` and ``'type'`` entries.
+
+    Returns:
+        list: Line dicts ``{"bbox": [x0, y0, x1, y1], "spans": [...]}`` where
+        the bbox encloses all spans of the line. An empty input yields an
+        empty list.
+    """
+    if not spans:
+        # Nothing to merge, e.g. a page without usable detections.
+        return []
+
+    standalone_types = ("displayed_equation", "image", "table")
+
+    # Sort by y0 so lines are built from the top of the page downwards.
+    spans.sort(key=lambda span: span['bbox'][1])
+
+    lines = []
+    current_line = [spans[0]]
+    for span in spans[1:]:
+        starts_new_line = (
+            # The span itself must stand alone ...
+            span['type'] in standalone_types
+            # ... or the current line already holds a standalone span ...
+            or any(s['type'] in standalone_types for s in current_line)
+            # ... or it does not sit on the same row as the previous span.
+            or not __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox'])
+        )
+        if starts_new_line:
+            lines.append(current_line)
+            current_line = [span]
+        else:
+            current_line.append(span)
+
+    # Flush the last line.
+    lines.append(current_line)
+
+    # Order each line by x0 and compute its enclosing bbox.
+    line_objects = []
+    for line in lines:
+        line.sort(key=lambda span: span['bbox'][0])
+        line_bbox = [
+            min(span['bbox'][0] for span in line),  # x0
+            min(span['bbox'][1] for span in line),  # y0
+            max(span['bbox'][2] for span in line),  # x1
+            max(span['bbox'][3] for span in line),  # y1
+        ]
+        line_objects.append({
+            "bbox": line_bbox,
+            "spans": line,
+        })
+
+    return line_objects
--- a/magic_pdf/pre_proc/solve_line_alien.py
+++ b/magic_pdf/pre_proc/solve_line_alien.py
@@ -0,0 +1,29 @@
+def solve_inline_too_large_interval(pdf_info_dict: dict) -> dict:  # works on each page's 'preproc_blocks'
+    """Prefix a space to lines that share their vertical extent with the previous line.
+
+    Pages are looked up as ``page_0`` .. ``page_<len - 1>``. Inside every
+    block, a line whose integer-truncated ``y0`` and ``y1`` both equal those
+    of the line before it gets a single space prepended to the ``'text'`` of
+    its first span. The comparison for a block's first line is made against
+    zeros. The dict is modified in place and returned.
+    """
+    for page_no in range(len(pdf_info_dict)):
+        page = pdf_info_dict[f'page_{page_no}']
+
+        for block in page['preproc_blocks']:
+            prev_top, prev_bottom = 0, 0
+
+            for line in block['lines']:
+                _, top, _, bottom = line['bbox']
+                same_row = int(top) == int(prev_top) and int(bottom) == int(prev_bottom)
+                if same_row:
+                    first_span = line['spans'][0]
+                    first_span['text'] = ' ' + first_span['text']
+
+                prev_top, prev_bottom = top, bottom
+
+    return pdf_info_dict
+
+
+
+
+
+
+
+
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,5 @@
 from setuptools import setup, find_packages
-
+import subprocess
 def parse_requirements(filename):
    with open(filename) as f:
        lines = f.read().splitlines()
@@ -15,12 +15,26 @@ def parse_requirements(filename):

    return requires

+def get_version():
+    """Derive the package version from the output of ``git describe --tags``.
+
+    The output is split on ``-``; when it has at least two parts and the
+    first starts with ``magic_pdf``, the second part is returned. Any
+    failure (git error, unexpected tag format) is printed and the fallback
+    ``"0.0.0"`` is returned instead.
+    """
+    fallback = "0.0.0"
+    try:
+        described = subprocess.check_output(["git", "describe", "--tags"])
+        version = described.decode().strip()
+        prefix, *rest = version.split("-")
+        if not rest or not prefix.startswith("magic_pdf"):
+            raise ValueError(f"Invalid version tag {version}. Expected format is magic_pdf-<version>-released.")
+        return rest[0]
+    except Exception as e:
+        print(e)
+        return fallback
+

 requires = parse_requirements('requirements.txt')

 setup(
    name="magic_pdf",  # 项目名
-    version="0.1.1",  # 版本号
+    # version="0.1.3",  # 版本号
+    version=get_version(),  # 自动从tag中获取版本号
    packages=find_packages(),  # 包含所有的包
    install_requires=requires,  # 项目依赖的第三方库
    python_requires=">=3.9",  # 项目依赖的 Python 版本
--- a/tests/test_commons.py
+++ b/tests/test_commons.py
@@ -1,10 +1,10 @@
 import io
 import json
 import os
-from magic_pdf.libs import fitz
+from magic_pdf.libs.commons import fitz

 from app.common.s3 import get_s3_config, get_s3_client
-from magic_pdf.libs import join_path, json_dump_path, read_file, parse_bucket_key
+from magic_pdf.libs.commons import join_path, json_dump_path, read_file, parse_bucket_key
 from loguru import logger

 test_pdf_dir_path = "s3://llm-pdf-text/unittest/pdf/"
--- a/tests/test_metascan_classify/test_classify.py
+++ b/tests/test_metascan_classify/test_classify.py
@@ -2,10 +2,10 @@ import os

 import pytest

-from magic_pdf.filter import classify_by_area, classify_by_text_len, classify_by_avg_words, \
+from magic_pdf.filter.pdf_classify_by_type import classify_by_area, classify_by_text_len, classify_by_avg_words, \
    classify_by_img_num, classify_by_text_layout, classify_by_img_narrow_strips
 from magic_pdf.filter.pdf_meta_scan import get_pdf_page_size_pts, get_pdf_textlen_per_page, get_imgs_per_page
-from test.test_commons import get_docs_from_test_pdf, get_test_json_data
+from tests.test_commons import get_docs_from_test_pdf, get_test_json_data

 # 获取当前目录
 current_directory = os.path.dirname(os.path.abspath(__file__))
--- a/tests/test_metascan_classify/test_meta_scan.py
+++ b/tests/test_metascan_classify/test_meta_scan.py
@@ -2,7 +2,7 @@ import os

 import pytest
 from magic_pdf.filter.pdf_meta_scan import get_pdf_page_size_pts, get_image_info, get_pdf_text_layout_per_page, get_language
-from test.test_commons import get_docs_from_test_pdf, get_test_json_data
+from tests.test_commons import get_docs_from_test_pdf, get_test_json_data

 # 获取当前目录
 current_directory = os.path.dirname(os.path.abspath(__file__))
--- a/tests/test_para/test_para_pipeline.py
+++ b/tests/test_para/test_para_pipeline.py
@@ -11,21 +11,21 @@ Execute the following command to run the tests under directory code-clean:
    
 """

-from test.test_para.test_pdf2text_recogPara_Common import (
+from tests.test_para.test_pdf2text_recogPara_Common import (
    TestIsBboxOverlap,
    TestIsInBbox,
    TestIsBboxOverlap,
    TestIsLineLeftAlignedFromNeighbors,
    TestIsLineRightAlignedFromNeighbors,
 )
-from test.test_para.test_pdf2text_recogPara_EquationsProcessor import TestCalcOverlapPct
-from test.test_para.test_pdf2text_recogPara_BlockInnerParasProcessor import TestIsConsistentLines
-from test.test_para.test_pdf2text_recogPara_BlockContinuationProcessor import (
+from tests.test_para.test_pdf2text_recogPara_EquationsProcessor import TestCalcOverlapPct
+from tests.test_para.test_pdf2text_recogPara_BlockInnerParasProcessor import TestIsConsistentLines
+from tests.test_para.test_pdf2text_recogPara_BlockContinuationProcessor import (
    TestIsAlphabetChar,
    TestIsChineseChar,
    TestIsOtherLetterChar,
 )
-from test.test_para.test_pdf2text_recogPara_TitleProcessor import TestTitleProcessor
+from tests.test_para.test_pdf2text_recogPara_TitleProcessor import TestTitleProcessor


 # Test suite
Author	SHA1	Message	Date
赵小蒙	fcea39d36b	增加ocr模式的layout解析功能	2024-03-07 20:41:41 +08:00
赵小蒙	00f3e329d9	修复一个span可能没有content导致的问题	2024-03-07 16:15:14 +08:00
赵小蒙	caa1588a92	ocr拼接逻辑更新	2024-03-07 16:04:16 +08:00
赵小蒙	a0be4652e6	parse_pdf_by_ocr 逻辑更新	2024-03-06 18:03:55 +08:00
赵小蒙	701f384994	增加ocr版本解析功能	2024-03-06 17:48:29 +08:00
赵小蒙	2e487cac34	修复目录重构导致的引用异常	2024-03-06 15:12:28 +08:00
赵小蒙	846dbecf45	更新readme	2024-03-06 14:58:36 +08:00
赵小蒙	a706743372	setup从tag中自动获取版本号	2024-03-05 15:05:51 +08:00
赵小蒙	7242a4a76e	更新模块版本号	2024-03-05 12:17:02 +08:00
赵小蒙	6cbf7fabcf	更新模块版本号	2024-03-05 12:03:12 +08:00
赵小蒙	5b9fa871bd	Merge remote-tracking branch 'origin/master'	2024-03-05 12:00:07 +08:00
hsy	bc339320ab	增加了一个solve_line_alien.py，在detect_equation.py中修改了__solve_contain_bboxs函数，并在pdf_parse_by_model.py里增加了函数solve_line_alien.py的调用	2024-03-05 10:58:02 +08:00