mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-04-12 15:29:03 +07:00
Compare commits
12 Commits
magic_pdf-
...
magic_pdf-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
fcea39d36b | ||
|
|
00f3e329d9 | ||
|
|
caa1588a92 | ||
|
|
a0be4652e6 | ||
|
|
701f384994 | ||
|
|
2e487cac34 | ||
|
|
846dbecf45 | ||
|
|
a706743372 | ||
|
|
7242a4a76e | ||
|
|
6cbf7fabcf | ||
|
|
5b9fa871bd | ||
|
|
bc339320ab |
10
.github/workflows/python-package.yml
vendored
10
.github/workflows/python-package.yml
vendored
@@ -20,21 +20,29 @@ jobs:
|
||||
python-version: ["3.10"]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
|
||||
|
||||
- name: Install wheel
|
||||
run: |
|
||||
python -m pip install wheel
|
||||
|
||||
- name: Build wheel
|
||||
run: |
|
||||
python setup.py bdist_wheel
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
|
||||
11
README.md
11
README.md
@@ -19,6 +19,17 @@ python 3.9+
|
||||
git clone https://github.com/myhloli/Magic-PDF.git
|
||||
```
|
||||
|
||||
2. Install the requirements
|
||||
|
||||
```sh
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
3. Run the main script
|
||||
|
||||
```sh
|
||||
use demo/demo_test.py
|
||||
```
|
||||
|
||||
### 版权说明
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@ import json
|
||||
import os
|
||||
from tqdm import tqdm
|
||||
|
||||
from magic_pdf.libs import join_path
|
||||
from magic_pdf.libs.commons import join_path
|
||||
|
||||
with open('/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset.json', 'r') as f:
|
||||
samples = json.load(f)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from magic_pdf.libs import fitz # PyMuPDF
|
||||
from magic_pdf.libs.commons import fitz # PyMuPDF
|
||||
|
||||
# PDF文件路径
|
||||
pdf_path = "D:\\project\\20231108code-clean\\code-clean\\tmp\\unittest\\download-pdfs\\scihub\\scihub_53700000\\libgen.scimag53724000-53724999.zip_10.1097\\00129191-200509000-00018.pdf"
|
||||
|
||||
39
demo/ocr_demo.py
Normal file
39
demo/ocr_demo.py
Normal file
@@ -0,0 +1,39 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from magic_pdf.dict2md.ocr_mkcontent import mk_nlp_markdown
|
||||
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
|
||||
|
||||
|
||||
def save_markdown(markdown_text, input_filepath):
    """Write ``markdown_text`` to a ``.md`` file next to ``input_filepath``.

    The output file is placed in the directory of the input file and reuses
    its file name with the final extension replaced by ``.md``. The file is
    opened in write mode, so an existing file at that path is overwritten.

    Args:
        markdown_text: Text to store.
        input_filepath: Path whose directory and base name determine where
            the Markdown file is written.
    """
    # Separate the directory from the file name of the input path.
    parent_dir, input_name = os.path.split(input_filepath)
    # Strip the final extension from the file name.
    stem, _ = os.path.splitext(input_name)
    # Build the output path in the same directory.
    target_path = os.path.join(parent_dir, f"{stem}.md")

    # Store the Markdown text as UTF-8.
    with open(target_path, 'w', encoding='utf-8') as md_file:
        md_file.write(markdown_text)
|
||||
|
||||
|
||||
def read_json_file(file_path):
    """Load and return the JSON document stored at ``file_path``.

    The file is decoded explicitly as UTF-8 instead of the platform default
    encoding, so OCR results containing non-ASCII text load the same way on
    every system.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The deserialized JSON content.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Hard-coded sample OCR result; point this at a local file before running.
    ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_1(3).json"
    try:
        # OCR json -> pdf_info_dict -> Markdown text.
        ocr_pdf_info = read_json_file(ocr_json_file_path)
        pdf_info_dict = parse_pdf_by_ocr(ocr_pdf_info)
        markdown_text = mk_nlp_markdown(pdf_info_dict)
        logger.info(markdown_text)
        # The Markdown file is written next to the input json.
        save_markdown(markdown_text, ocr_json_file_path)
    except Exception as e:
        # Log at error level together with the traceback, so a failure in
        # the demo can be located instead of showing only the message.
        logger.exception(e)
|
||||
@@ -5,7 +5,7 @@ from pathlib import Path
|
||||
import click
|
||||
from loguru import logger
|
||||
|
||||
from magic_pdf.libs import join_path
|
||||
from magic_pdf.libs.commons import join_path
|
||||
from magic_pdf.dict2md.mkcontent import mk_mm_markdown
|
||||
from magic_pdf.pipeline import parse_pdf_by_model
|
||||
|
||||
|
||||
23
magic_pdf/dict2md/ocr_mkcontent.py
Normal file
23
magic_pdf/dict2md/ocr_mkcontent.py
Normal file
@@ -0,0 +1,23 @@
|
||||
def mk_nlp_markdown(pdf_info_dict: dict):
    """Render the parsed page dictionary as plain Markdown text.

    Every line of every block in each page's ``"preproc_blocks"`` becomes one
    Markdown line. Spans without content (for example images and tables) are
    skipped. ``$`` characters in a span's content are escaped; inline
    equations are then wrapped in ``$...$`` and displayed equations in
    ``$$`` fences.

    Args:
        pdf_info_dict: Mapping of page key to page info. Pages without
            ``"preproc_blocks"`` (or with an empty list) are skipped.

    Returns:
        str: The Markdown lines joined with newlines. Each line ends with two
        spaces, which Markdown renders as a hard line break.
    """
    markdown = []

    for page_info in pdf_info_dict.values():
        blocks = page_info.get("preproc_blocks")
        if not blocks:
            continue
        for block in blocks:
            for line in block['lines']:
                pieces = []
                for span in line['spans']:
                    if not span.get('content'):
                        continue
                    # Escape "$" so it is not read as a math delimiter.
                    content = span['content'].replace('$', '\\$')
                    span_type = span.get('type')
                    if span_type == 'inline_equation':
                        content = f"${content}$"
                    elif span_type == 'displayed_equation':
                        content = f"$$\n{content}\n$$"
                    pieces.append(content)
                # Two trailing spaces force a line break in Markdown.
                markdown.append(' '.join(pieces).strip() + '  ')

    return '\n'.join(markdown)
|
||||
@@ -119,6 +119,20 @@ def _is_left_overlap(box1, box2,):
|
||||
return x0_1<=x0_2<=x1_1 and vertical_overlap_cond
|
||||
|
||||
|
||||
def __is_overlaps_y_exceeds_threshold(bbox1, bbox2, overlap_ratio_threshold=0.8):
    """Check whether two boxes overlap enough along the y axis.

    The vertical overlap is divided by the height of the shorter box, and the
    result is compared with ``overlap_ratio_threshold``.

    Args:
        bbox1: Box as ``(x0, y0, x1, y1)``.
        bbox2: Box as ``(x0, y0, x1, y1)``.
        overlap_ratio_threshold: Ratio that must be strictly exceeded
            (default 0.8, i.e. 80% of the shorter box).

    Returns:
        bool: True when the overlap ratio is greater than the threshold.
        False when the shorter box has no positive height, since no ratio
        can be computed for it.
    """
    _, y0_1, _, y1_1 = bbox1
    _, y0_2, _, y1_2 = bbox2

    overlap = max(0, min(y1_1, y1_2) - max(y0_1, y0_2))
    min_height = min(y1_1 - y0_1, y1_2 - y0_2)

    # A zero-height box would otherwise raise ZeroDivisionError.
    if min_height <= 0:
        return False

    return (overlap / min_height) > overlap_ratio_threshold
|
||||
|
||||
|
||||
|
||||
def calculate_iou(bbox1, bbox2):
|
||||
# Determine the coordinates of the intersection rectangle
|
||||
x_left = max(bbox1[0], bbox2[0])
|
||||
@@ -163,7 +177,25 @@ def calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2):
|
||||
else:
|
||||
return intersection_area / min_box_area
|
||||
|
||||
|
||||
|
||||
def get_minbox_if_overlap_by_ratio(bbox1, bbox2, ratio):
    """Return the smaller of two boxes when they overlap heavily.

    The overlap is measured with ``calculate_overlap_area_2_minbox_area_ratio``
    (overlapping area relative to the area of the smaller box). When that
    value is strictly greater than ``ratio``, the box with the strictly
    smaller area is returned. In every other case, including boxes of equal
    area, the result is ``None``.
    """
    left1, top1, right1, bottom1 = bbox1
    left2, top2, right2, bottom2 = bbox2
    first_area = (right1 - left1) * (bottom1 - top1)
    second_area = (right2 - left2) * (bottom2 - top2)
    overlap_ratio = calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2)

    # Not enough overlap: there is nothing to pick.
    if not (overlap_ratio > ratio):
        return None
    if first_area < second_area:
        return bbox1
    if second_area < first_area:
        return bbox2
    return None
|
||||
|
||||
def get_bbox_in_boundry(bboxes:list, boundry:tuple)-> list:
|
||||
x0, y0, x1, y1 = boundry
|
||||
new_boxes = [box for box in bboxes if box[0] >= x0 and box[1] >= y0 and box[2] <= x1 and box[3] <= y1]
|
||||
|
||||
@@ -59,6 +59,7 @@ from magic_pdf.post_proc.pdf_post_filter import pdf_post_filter
|
||||
from magic_pdf.pre_proc.remove_rotate_bbox import get_side_boundry, remove_rotate_side_textblock, remove_side_blank_block
|
||||
from magic_pdf.pre_proc.resolve_bbox_conflict import check_text_block_horizontal_overlap, resolve_bbox_overlap_conflict
|
||||
from magic_pdf.pre_proc.fix_table import fix_table_text_block, fix_tables, include_table_title
|
||||
from magic_pdf.pre_proc.solve_line_alien import solve_inline_too_large_interval
|
||||
|
||||
denseSingleLineBlockException_msg = DenseSingleLineBlockException().message
|
||||
titleDetectionException_msg = TitleDetectionException().message
|
||||
@@ -446,6 +447,10 @@ def parse_pdf_by_model(
|
||||
==================================================================================================================================
|
||||
进入段落处理-2阶段
|
||||
"""
|
||||
|
||||
# 处理行内文字间距较大问题
|
||||
pdf_info_dict = solve_inline_too_large_interval(pdf_info_dict)
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
para_process_pipeline = ParaProcessPipeline()
|
||||
|
||||
93
magic_pdf/pdf_parse_by_ocr.py
Normal file
93
magic_pdf/pdf_parse_by_ocr.py
Normal file
@@ -0,0 +1,93 @@
|
||||
from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
|
||||
from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line, remove_overlaps_min_spans
|
||||
|
||||
|
||||
def construct_page_component(page_id, blocks, layout_bboxes):
    """Bundle the results for one page into the page-info dictionary.

    Returns:
        dict: ``blocks`` under ``'preproc_blocks'``, ``page_id`` under
        ``'page_idx'`` and ``layout_bboxes`` under ``'layout_bboxes'``.
    """
    page_component = {}
    page_component['preproc_blocks'] = blocks
    page_component['page_idx'] = page_id
    page_component['layout_bboxes'] = layout_bboxes
    return page_component
|
||||
|
||||
|
||||
def parse_pdf_by_ocr(
    ocr_pdf_info,
    start_page_id=0,
    end_page_id=None,
):
    """Convert per-page OCR detections into the ``pdf_info_dict`` structure.

    For every page in the requested range, the allowed detections are turned
    into spans, overlapping spans are pruned, the remaining spans are grouped
    into lines, and each line is wrapped in its own block.

    Args:
        ocr_pdf_info: Sequence of per-page OCR results, indexed by page id.
            Each page provides ``'layout_dets'`` (detections carrying
            ``'category_id'``, ``'poly'`` and, depending on the category,
            ``'latex'`` or ``'text'``) and ``'subfield_dets'`` (passed to
            ``layout_detect``).
        start_page_id: Index of the first page to process.
        end_page_id: Index of the last page to process (inclusive).
            ``None`` means the last page of ``ocr_pdf_info``.

    Returns:
        dict: ``"page_{page_id}"`` mapped to the page dictionary built by
        ``construct_page_component``.
    """
    # Categories that become spans: id -> (span type, detection field that
    # holds the content, or None when the span carries no content).
    # Other category ids listed by the original author:
    #   to be removed: 3 header, 4 page number, 5 footnote, 6 footer
    #   layout info:   11 full column, 12 sub column
    span_categories = {
        1: ('image', None),
        7: ('table', None),
        13: ('inline_equation', 'latex'),
        14: ('displayed_equation', 'latex'),
        15: ('text', 'text'),
    }

    pdf_info_dict = {}
    # Compare with None explicitly: a truthiness test would treat
    # end_page_id=0 ("only the first page") as "process every page".
    if end_page_id is None:
        end_page_id = len(ocr_pdf_info) - 1

    for page_id in range(start_page_id, end_page_id + 1):
        ocr_page_info = ocr_pdf_info[page_id]

        spans = []
        for layout_det in ocr_page_info['layout_dets']:
            category = span_categories.get(layout_det['category_id'])
            if category is None:
                continue
            span_type, content_key = category

            # 'poly' holds eight numbers; the 1st/2nd and 5th/6th are used
            # as the (x0, y0) and (x1, y1) corners of the bbox.
            x0, y0, _, _, x1, y1, _, _ = layout_det['poly']
            span = {'bbox': [int(x0), int(y0), int(x1), int(y1)]}
            if content_key is not None:
                span['content'] = layout_det[content_key]
            span['type'] = span_type
            spans.append(span)

        # Drop the smaller span of each heavily overlapping pair.
        spans = remove_overlaps_min_spans(spans)

        # Planned by the original author but not implemented here: for
        # displayed_equation / image / table spans with text to their left,
        # move the span's y0 below the y0 of that text.

        # Group spans into lines (top to bottom, left to right).
        lines = merge_spans_to_line(spans)

        # No block merging yet: each block holds exactly one line and shares
        # that line's bbox.
        blocks = [{"bbox": line['bbox'], "lines": [line]} for line in lines]

        # Layout boxes parsed from the page's sub-field detections.
        layout_bboxes = layout_detect(ocr_page_info['subfield_dets'])

        pdf_info_dict[f"page_{page_id}"] = construct_page_component(page_id, blocks, layout_bboxes)

    return pdf_info_dict
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from magic_pdf.libs.boxbase import _is_in # 正则
|
||||
from magic_pdf.libs.boxbase import _is_in, calculate_overlap_area_2_minbox_area_ratio # 正则
|
||||
from magic_pdf.libs.commons import fitz # pyMuPDF库
|
||||
|
||||
|
||||
@@ -18,7 +18,16 @@ def __solve_contain_bboxs(all_bbox_list: list):
|
||||
dump_list.append(all_bbox_list[i])
|
||||
elif _is_in(bbox2, bbox1):
|
||||
dump_list.append(all_bbox_list[j])
|
||||
|
||||
else:
|
||||
ratio = calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2)
|
||||
if ratio > 0.7:
|
||||
s1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
|
||||
s2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
|
||||
if s2 > s1:
|
||||
dump_list.append(all_bbox_list[i])
|
||||
else:
|
||||
dump_list.append(all_bbox_list[i])
|
||||
|
||||
# 遍历需要删除的列表中的每个元素
|
||||
for item in dump_list:
|
||||
|
||||
|
||||
123
magic_pdf/pre_proc/ocr_detect_layout.py
Normal file
123
magic_pdf/pre_proc/ocr_detect_layout.py
Normal file
@@ -0,0 +1,123 @@
|
||||
from magic_pdf.libs.boxbase import _is_part_overlap, _is_in
|
||||
|
||||
def get_center_point(bbox):
    """Compute the center point of a bounding box.

    Args:
        bbox (list): Box coordinates; indexes 0 to 3 are read as top-left x,
            top-left y, bottom-right x and bottom-right y.

    Returns:
        list: ``[x, y]`` coordinates of the center.
    """
    center_x = (bbox[0] + bbox[2]) / 2
    center_y = (bbox[1] + bbox[3]) / 2
    return [center_x, center_y]
|
||||
|
||||
|
||||
def get_area(bbox):
    """Compute the area of a bounding box.

    Args:
        bbox (list): Box coordinates; indexes 0 to 3 are read as top-left x,
            top-left y, bottom-right x and bottom-right y.

    Returns:
        The width multiplied by the height of the box.
    """
    width = bbox[2] - bbox[0]
    height = bbox[3] - bbox[1]
    return width * height
|
||||
|
||||
|
||||
def adjust_layouts(layout_bboxes):
    """Shrink the larger box of every partially overlapping layout pair.

    Each pair of layout boxes is tested with ``_is_part_overlap``. For an
    overlapping pair, the box with the larger area has one edge moved onto
    the facing edge of the smaller box. The horizontal edges are adjusted
    when the centers are further apart in x than in y; otherwise the
    vertical edges are adjusted.

    Args:
        layout_bboxes (list): Dictionaries that each hold a
            ``"layout_bbox"`` list ``[x0, y0, x1, y1]``.

    Returns:
        list: The same list object. The ``"layout_bbox"`` lists of adjusted
        boxes are modified in place.
    """
    for i, first in enumerate(layout_bboxes):
        for second in layout_bboxes[i + 1:]:
            box_i = first["layout_bbox"]
            box_j = second["layout_bbox"]

            # The overlap test needs the coordinate lists, like every other
            # helper call here, not the dictionaries wrapping them.
            if not _is_part_overlap(box_i, box_j):
                continue

            # Distance between the two centers along each axis.
            center_i = get_center_point(box_i)
            center_j = get_center_point(box_j)
            dx = abs(center_i[0] - center_j[0])
            dy = abs(center_i[1] - center_j[1])

            # Only the larger box gets trimmed.
            if get_area(box_i) > get_area(box_j):
                larger, smaller = box_i, box_j
            else:
                larger, smaller = box_j, box_i

            # NOTE(review): the edge choice below is kept exactly as in the
            # original; confirm it cannot invert a box (x0 > x1 or y0 > y1).
            if dx > dy:  # side-by-side overlap
                if larger[0] < smaller[2]:
                    larger[0] = smaller[2]
                else:
                    larger[2] = smaller[0]
            else:  # stacked overlap
                if larger[1] < smaller[3]:
                    larger[1] = smaller[3]
                else:
                    larger[3] = smaller[1]

    return layout_bboxes
|
||||
|
||||
|
||||
|
||||
|
||||
def layout_detect(layout_info):
    """Build the list of layout boxes from the sub-layout detections.

    Each sub-layout is reduced to a ``{"layout_bbox": [x0, y0, x1, y1]}``
    dictionary. Boxes for which ``_is_in`` reports containment in another
    box are discarded, and the remaining boxes are passed through
    ``adjust_layouts``.

    Args:
        layout_info (list): Sub-layout dictionaries, each with a ``'poly'``
            field of eight numbers describing the box.

    Returns:
        list: Layout box dictionaries, each with a ``'layout_bbox'`` field,
        as returned by ``adjust_layouts``.
    """
    # Turn every sub-layout polygon into a box dictionary.
    candidates = []
    for sub_layout in layout_info:
        left, top, _, _, right, bottom, _, _ = sub_layout['poly']
        candidates.append({"layout_bbox": [left, top, right, bottom]})

    # Keep only the boxes that no other box contains.
    survivors = [
        candidate
        for idx, candidate in enumerate(candidates)
        if not any(
            _is_in(candidate["layout_bbox"], other["layout_bbox"])
            for other_idx, other in enumerate(candidates)
            if other_idx != idx
        )
    ]

    # Resolve the remaining partial overlaps.
    return adjust_layouts(survivors)
|
||||
|
||||
|
||||
60
magic_pdf/pre_proc/ocr_dict_merge.py
Normal file
60
magic_pdf/pre_proc/ocr_dict_merge.py
Normal file
@@ -0,0 +1,60 @@
|
||||
from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio
|
||||
|
||||
|
||||
# 删除重叠spans中较小的那些
|
||||
def remove_overlaps_min_spans(spans):
    """Remove the smaller span of every heavily overlapping pair, in place.

    Two spans form such a pair when ``get_minbox_if_overlap_by_ratio``
    returns one of their boxes for a ratio of 0.8. A span that has already
    been marked for removal takes no part in later comparisons, so a
    discarded span cannot evict further spans.

    Args:
        spans (list): Span dictionaries, each with a ``'bbox'`` entry.

    Returns:
        list: The same list object, with the removed spans filtered out.
    """
    dropped_ids = set()

    for i, span1 in enumerate(spans):
        if id(span1) in dropped_ids:
            continue
        for span2 in spans[i + 1:]:
            # Skip spans that are already gone or that compare equal.
            if id(span2) in dropped_ids or span1 == span2:
                continue
            overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.8)
            if overlap_box is None:
                continue
            # The helper returns the box of the smaller span of this pair.
            victim = span1 if overlap_box == span1['bbox'] else span2
            dropped_ids.add(id(victim))
            if victim is span1:
                # span1 is gone; stop comparing it with the others.
                break

    # Filter in place so callers holding the list see the result too.
    spans[:] = [span for span in spans if id(span) not in dropped_ids]
    return spans
|
||||
|
||||
|
||||
def merge_spans_to_line(spans):
    """Group spans into lines, top to bottom and left to right.

    Spans are sorted by ``y0`` (the input list is sorted in place). A span
    joins the current line when it overlaps the line's last span vertically
    according to ``__is_overlaps_y_exceeds_threshold``. Spans of type
    ``displayed_equation``, ``image`` or ``table`` always get a line of
    their own.

    Args:
        spans (list): Span dictionaries with ``'bbox'`` and ``'type'``.

    Returns:
        list: One dictionary per line with ``"bbox"`` (the box enclosing
        all spans of the line) and ``"spans"`` (the line's spans sorted by
        ``x0``). An empty input yields an empty list.
    """
    # Nothing to merge, e.g. a page without any usable detection.
    if not spans:
        return []

    # Process spans from top to bottom.
    spans.sort(key=lambda span: span['bbox'][1])

    standalone_types = ("displayed_equation", "image", "table")

    lines = []
    current_line = [spans[0]]
    for span in spans[1:]:
        # A new line starts when this span or the current line is a
        # standalone element, or when the span does not overlap the last
        # span of the current line vertically.
        starts_new_line = (
            span['type'] in standalone_types
            or any(s['type'] in standalone_types for s in current_line)
            or not __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox'])
        )
        if starts_new_line:
            lines.append(current_line)
            current_line = [span]
        else:
            current_line.append(span)
    lines.append(current_line)

    # Order the spans of each line by x0 and compute the enclosing bbox.
    line_objects = []
    for line in lines:
        line.sort(key=lambda span: span['bbox'][0])
        line_bbox = [
            min(span['bbox'][0] for span in line),  # x0
            min(span['bbox'][1] for span in line),  # y0
            max(span['bbox'][2] for span in line),  # x1
            max(span['bbox'][3] for span in line),  # y1
        ]
        line_objects.append({
            "bbox": line_bbox,
            "spans": line,
        })

    return line_objects
|
||||
29
magic_pdf/pre_proc/solve_line_alien.py
Normal file
29
magic_pdf/pre_proc/solve_line_alien.py
Normal file
@@ -0,0 +1,29 @@
|
||||
def solve_inline_too_large_interval(pdf_info_dict: dict) -> dict:
    """Fix text that was split into several lines on the same row.

    Within every block of a page's ``'preproc_blocks'``, a line whose
    integer ``y0`` and ``y1`` equal those of the preceding line gets one
    space prepended to the ``'text'`` of its first span.

    Pages are found by their ``page_`` key prefix rather than by counting
    from ``page_0``, so page ranges that do not start at 0 and dictionaries
    with additional keys are handled too.

    Args:
        pdf_info_dict: Mapping of ``"page_{n}"`` to page info dictionaries.

    Returns:
        dict: The same dictionary, modified in place.
    """
    for page_key, page_info in pdf_info_dict.items():
        if not str(page_key).startswith('page_'):
            continue

        for block in page_info.get('preproc_blocks') or []:
            # No previous line yet. A real (0, 0) row must not act as the
            # "nothing seen" marker, hence None.
            previous_rows = None

            for line in block['lines']:
                _, y_cur_1, _, y_cur_2 = line['bbox']
                current_rows = (int(y_cur_1), int(y_cur_2))

                # Same row as the previous line: separate the texts with a
                # space. Lines without spans have nothing to prefix.
                if current_rows == previous_rows and line['spans']:
                    first_span = line['spans'][0]
                    first_span['text'] = ' ' + first_span['text']

                previous_rows = current_rows

    return pdf_info_dict
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
18
setup.py
18
setup.py
@@ -1,5 +1,5 @@
|
||||
from setuptools import setup, find_packages
|
||||
|
||||
import subprocess
|
||||
def parse_requirements(filename):
|
||||
with open(filename) as f:
|
||||
lines = f.read().splitlines()
|
||||
@@ -15,12 +15,26 @@ def parse_requirements(filename):
|
||||
|
||||
return requires
|
||||
|
||||
def get_version():
    """Derive the package version from ``git describe --tags``.

    The command output is split on ``-``. When it has more than one part
    and the first part starts with ``magic_pdf``, the second part is
    returned as the version. Any failure (the git command failing, or a tag
    in another format) is printed and ``"0.0.0"`` is returned instead.
    """
    try:
        version = subprocess.check_output(["git", "describe", "--tags"]).decode().strip()
        prefix, *remainder = version.split("-")
        # Reject tags that do not look like magic_pdf-<version>-released.
        if not (remainder and prefix.startswith("magic_pdf")):
            raise ValueError(f"Invalid version tag {version}. Expected format is magic_pdf-<version>-released.")
        return remainder[0]
    except Exception as e:
        print(e)
        return "0.0.0"
|
||||
|
||||
|
||||
requires = parse_requirements('requirements.txt')
|
||||
|
||||
setup(
|
||||
name="magic_pdf", # 项目名
|
||||
version="0.1.1", # 版本号
|
||||
# version="0.1.3", # 版本号
|
||||
version=get_version(), # 自动从tag中获取版本号
|
||||
packages=find_packages(), # 包含所有的包
|
||||
install_requires=requires, # 项目依赖的第三方库
|
||||
python_requires=">=3.9", # 项目依赖的 Python 版本
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
from magic_pdf.libs import fitz
|
||||
from magic_pdf.libs.commons import fitz
|
||||
|
||||
from app.common.s3 import get_s3_config, get_s3_client
|
||||
from magic_pdf.libs import join_path, json_dump_path, read_file, parse_bucket_key
|
||||
from magic_pdf.libs.commons import join_path, json_dump_path, read_file, parse_bucket_key
|
||||
from loguru import logger
|
||||
|
||||
test_pdf_dir_path = "s3://llm-pdf-text/unittest/pdf/"
|
||||
|
||||
@@ -2,10 +2,10 @@ import os
|
||||
|
||||
import pytest
|
||||
|
||||
from magic_pdf.filter import classify_by_area, classify_by_text_len, classify_by_avg_words, \
|
||||
from magic_pdf.filter.pdf_classify_by_type import classify_by_area, classify_by_text_len, classify_by_avg_words, \
|
||||
classify_by_img_num, classify_by_text_layout, classify_by_img_narrow_strips
|
||||
from magic_pdf.filter.pdf_meta_scan import get_pdf_page_size_pts, get_pdf_textlen_per_page, get_imgs_per_page
|
||||
from test.test_commons import get_docs_from_test_pdf, get_test_json_data
|
||||
from tests.test_commons import get_docs_from_test_pdf, get_test_json_data
|
||||
|
||||
# 获取当前目录
|
||||
current_directory = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
@@ -2,7 +2,7 @@ import os
|
||||
|
||||
import pytest
|
||||
from magic_pdf.filter.pdf_meta_scan import get_pdf_page_size_pts, get_image_info, get_pdf_text_layout_per_page, get_language
|
||||
from test.test_commons import get_docs_from_test_pdf, get_test_json_data
|
||||
from tests.test_commons import get_docs_from_test_pdf, get_test_json_data
|
||||
|
||||
# 获取当前目录
|
||||
current_directory = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
@@ -11,21 +11,21 @@ Execute the following command to run the tests under directory code-clean:
|
||||
|
||||
"""
|
||||
|
||||
from test.test_para.test_pdf2text_recogPara_Common import (
|
||||
from tests.test_para.test_pdf2text_recogPara_Common import (
|
||||
TestIsBboxOverlap,
|
||||
TestIsInBbox,
|
||||
TestIsBboxOverlap,
|
||||
TestIsLineLeftAlignedFromNeighbors,
|
||||
TestIsLineRightAlignedFromNeighbors,
|
||||
)
|
||||
from test.test_para.test_pdf2text_recogPara_EquationsProcessor import TestCalcOverlapPct
|
||||
from test.test_para.test_pdf2text_recogPara_BlockInnerParasProcessor import TestIsConsistentLines
|
||||
from test.test_para.test_pdf2text_recogPara_BlockContinuationProcessor import (
|
||||
from tests.test_para.test_pdf2text_recogPara_EquationsProcessor import TestCalcOverlapPct
|
||||
from tests.test_para.test_pdf2text_recogPara_BlockInnerParasProcessor import TestIsConsistentLines
|
||||
from tests.test_para.test_pdf2text_recogPara_BlockContinuationProcessor import (
|
||||
TestIsAlphabetChar,
|
||||
TestIsChineseChar,
|
||||
TestIsOtherLetterChar,
|
||||
)
|
||||
from test.test_para.test_pdf2text_recogPara_TitleProcessor import TestTitleProcessor
|
||||
from tests.test_para.test_pdf2text_recogPara_TitleProcessor import TestTitleProcessor
|
||||
|
||||
|
||||
# Test suite
|
||||
|
||||
Reference in New Issue
Block a user