mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-04-12 07:06:44 +07:00
Compare commits
5 Commits
magic_pdf-
...
magic_pdf-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3d2fcc9dce | ||
|
|
d3c9cb84f8 | ||
|
|
8c089976ed | ||
|
|
473a0a7de0 | ||
|
|
61c970f7da |
@@ -92,7 +92,8 @@ def ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info, start_page_id=0,
|
||||
if __name__ == '__main__':
|
||||
pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
|
||||
json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
|
||||
ocr_local_parse(pdf_path, json_file_path)
|
||||
# book_name = "数学新星网/edu_00001236"
|
||||
# ocr_online_parse(book_name)
|
||||
# ocr_local_parse(pdf_path, json_file_path)
|
||||
book_name = "科数网/edu_00011318"
|
||||
ocr_online_parse(book_name)
|
||||
|
||||
pass
|
||||
|
||||
@@ -122,7 +122,10 @@ def ocr_mk_mm_markdown_with_para_core(paras_of_layout, mode):
|
||||
pass
|
||||
if content != '':
|
||||
para_text += content + ' '
|
||||
page_markdown.append(para_text.strip() + ' ')
|
||||
if para_text.strip() == '':
|
||||
continue
|
||||
else:
|
||||
page_markdown.append(para_text.strip() + ' ')
|
||||
return page_markdown
|
||||
|
||||
|
||||
|
||||
@@ -320,6 +320,9 @@ def __connect_list_inter_layout(layout_paras, new_layout_bbox, layout_list_info,
|
||||
如果上个layout的最后一个段落是列表,下一个layout的第一个段落也是列表,那么将他们连接起来。 TODO 因为没有区分列表和段落,所以这个方法暂时不实现。
|
||||
根据layout_list_info判断是不是列表。,下个layout的第一个段如果不是列表,那么看他们是否有几行都有相同的缩进。
|
||||
"""
|
||||
if len(layout_paras)==0 or len(layout_list_info)==0: # 0的时候最后的return 会出错
|
||||
return layout_paras, [False, False]
|
||||
|
||||
for i in range(1, len(layout_paras)):
|
||||
pre_layout_list_info = layout_list_info[i-1]
|
||||
next_layout_list_info = layout_list_info[i]
|
||||
@@ -353,6 +356,9 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
|
||||
如果上个layout的最后一个段落是列表,下一个layout的第一个段落也是列表,那么将他们连接起来。 TODO 因为没有区分列表和段落,所以这个方法暂时不实现。
|
||||
根据layout_list_info判断是不是列表。,下个layout的第一个段如果不是列表,那么看他们是否有几行都有相同的缩进。
|
||||
"""
|
||||
if len(pre_page_paras)==0 or len(next_page_paras)==0: # 0的时候最后的return 会出错
|
||||
return False
|
||||
|
||||
if pre_page_list_info[1] and not next_page_list_info[0]: # 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进
|
||||
logger.info(f"连接page {page_num} 内的list")
|
||||
# 向layout_paras[i] 寻找开头具有相同缩进的连续的行
|
||||
@@ -395,10 +401,19 @@ def __connect_para_inter_layoutbox(layout_paras, new_layout_bbox, lang):
|
||||
|
||||
"""
|
||||
connected_layout_paras = []
|
||||
if len(layout_paras)==0:
|
||||
return connected_layout_paras
|
||||
|
||||
connected_layout_paras.append(layout_paras[0])
|
||||
for i in range(1, len(layout_paras)):
|
||||
pre_last_line = layout_paras[i-1][-1][-1]
|
||||
next_first_line = layout_paras[i][0][0]
|
||||
try:
|
||||
if len(layout_paras[i])==0 or len(layout_paras[i-1])==0: # TODO 考虑连接问题,
|
||||
continue
|
||||
pre_last_line = layout_paras[i-1][-1][-1]
|
||||
next_first_line = layout_paras[i][0][0]
|
||||
except Exception as e:
|
||||
logger.error(f"page layout {i} has no line")
|
||||
continue
|
||||
pre_last_line_text = ''.join([__get_span_text(span) for span in pre_last_line['spans']])
|
||||
pre_last_line_type = pre_last_line['spans'][-1]['type']
|
||||
next_first_line_text = ''.join([__get_span_text(span) for span in next_first_line['spans']])
|
||||
@@ -435,7 +450,7 @@ def __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
|
||||
2. 后一个页面的第一个段落第一行没有空白开头。
|
||||
"""
|
||||
# 有的页面可能压根没有文字
|
||||
if len(pre_page_paras)==0 or len(next_page_paras)==0:
|
||||
if len(pre_page_paras)==0 or len(next_page_paras)==0 or len(pre_page_paras[0])==0 or len(next_page_paras[0])==0: # TODO [[]]为什么出现在pre_page_paras里?
|
||||
return False
|
||||
pre_last_para = pre_page_paras[-1][-1]
|
||||
next_first_para = next_page_paras[0][0]
|
||||
@@ -486,7 +501,7 @@ def find_consecutive_true_regions(input_array):
|
||||
return regions
|
||||
|
||||
|
||||
def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang):
|
||||
def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, debug_mode):
|
||||
"""
|
||||
找出来中间对齐的连续单行文本,如果连续行高度相同,那么合并为一个段落。
|
||||
一个line居中的条件是:
|
||||
@@ -512,8 +527,8 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang):
|
||||
first_line_text = ''.join([__get_span_text(span) for span in layout_para[start][0]['spans']])
|
||||
if "Table" in first_line_text or "Figure" in first_line_text:
|
||||
pass
|
||||
|
||||
logger.info(line_hi.std())
|
||||
if debug_mode:
|
||||
logger.info(line_hi.std())
|
||||
|
||||
if line_hi.std()<2:
|
||||
"""行高度相同,那么判断是否居中"""
|
||||
@@ -525,7 +540,8 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang):
|
||||
and not all([x1==layout_box[2] for x1 in all_right_x1]):
|
||||
merge_para = [l[0] for l in layout_para[start:end+1]]
|
||||
para_text = ''.join([__get_span_text(span) for line in merge_para for span in line['spans']])
|
||||
logger.info(para_text)
|
||||
if debug_mode:
|
||||
logger.info(para_text)
|
||||
layout_para[start:end+1] = [merge_para]
|
||||
index_offset -= end-start
|
||||
|
||||
@@ -561,7 +577,7 @@ def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang):
|
||||
return connected_layout_paras, page_list_info
|
||||
|
||||
|
||||
def para_split(pdf_info_dict, lang="en"):
|
||||
def para_split(pdf_info_dict, debug_mode, lang="en"):
|
||||
"""
|
||||
根据line和layout情况进行分段
|
||||
"""
|
||||
@@ -586,13 +602,15 @@ def para_split(pdf_info_dict, lang="en"):
|
||||
pre_page_layout_bbox = new_layout_of_pages[page_num-1]
|
||||
next_page_layout_bbox = new_layout_of_pages[page_num]
|
||||
|
||||
is_conn= __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, page_num, lang)
|
||||
if is_conn:
|
||||
logger.info(f"连接了第{page_num-1}页和第{page_num}页的段落")
|
||||
is_conn = __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, page_num, lang)
|
||||
if debug_mode:
|
||||
if is_conn:
|
||||
logger.info(f"连接了第{page_num-1}页和第{page_num}页的段落")
|
||||
|
||||
is_list_conn = __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, all_page_list_info[page_num-1], all_page_list_info[page_num], page_num, lang)
|
||||
if is_list_conn:
|
||||
logger.info(f"连接了第{page_num-1}页和第{page_num}页的列表段落")
|
||||
if debug_mode:
|
||||
if is_list_conn:
|
||||
logger.info(f"连接了第{page_num-1}页和第{page_num}页的列表段落")
|
||||
|
||||
"""接下来可能会漏掉一些特别的一些可以合并的内容,对他们进行段落连接
|
||||
1. 正文中有时出现一个行顶格,接下来几行缩进的情况。
|
||||
@@ -601,5 +619,5 @@ def para_split(pdf_info_dict, lang="en"):
|
||||
for page_num, page in enumerate(pdf_info_dict.values()):
|
||||
page_paras = page['para_blocks']
|
||||
new_layout_bbox = new_layout_of_pages[page_num]
|
||||
__connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang)
|
||||
__connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, debug_mode=debug_mode)
|
||||
__merge_signle_list_text(page_paras, new_layout_bbox, page_num, lang)
|
||||
|
||||
@@ -57,16 +57,16 @@ def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, lay
|
||||
|
||||
|
||||
def parse_pdf_by_ocr(
|
||||
pdf_path,
|
||||
s3_pdf_profile,
|
||||
pdf_model_output,
|
||||
save_path,
|
||||
book_name,
|
||||
pdf_model_profile=None,
|
||||
image_s3_config=None,
|
||||
start_page_id=0,
|
||||
end_page_id=None,
|
||||
debug_mode=False,
|
||||
pdf_path,
|
||||
s3_pdf_profile,
|
||||
pdf_model_output,
|
||||
save_path,
|
||||
book_name,
|
||||
pdf_model_profile=None,
|
||||
image_s3_config=None,
|
||||
start_page_id=0,
|
||||
end_page_id=None,
|
||||
debug_mode=False,
|
||||
):
|
||||
pdf_bytes = read_file(pdf_path, s3_pdf_profile)
|
||||
save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
|
||||
@@ -95,7 +95,6 @@ def parse_pdf_by_ocr(
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
|
||||
end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
|
||||
for page_id in range(start_page_id, end_page_id + 1):
|
||||
|
||||
@@ -125,13 +124,6 @@ def parse_pdf_by_ocr(
|
||||
page_id, page, ocr_page_info, md_bookname_save_path, debug_mode=debug_mode
|
||||
)
|
||||
|
||||
# 构建需要remove的bbox列表
|
||||
# need_remove_spans_bboxes = []
|
||||
# need_remove_spans_bboxes.extend(page_no_bboxes)
|
||||
# need_remove_spans_bboxes.extend(header_bboxes)
|
||||
# need_remove_spans_bboxes.extend(footer_bboxes)
|
||||
# need_remove_spans_bboxes.extend(footnote_bboxes)
|
||||
|
||||
# 构建需要remove的bbox字典
|
||||
need_remove_spans_bboxes_dict = {
|
||||
DropTag.PAGE_NUMBER: page_no_bboxes,
|
||||
@@ -199,50 +191,48 @@ def parse_pdf_by_ocr(
|
||||
else:
|
||||
continue
|
||||
|
||||
|
||||
|
||||
|
||||
# 删除重叠spans中较小的那些
|
||||
'''删除重叠spans中较小的那些'''
|
||||
spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
|
||||
|
||||
# 删除remove_span_block_bboxes中的bbox
|
||||
# spans = remove_spans_by_bboxes(spans, need_remove_spans_bboxes)
|
||||
# 按qa要求,增加drop相关数据
|
||||
'''
|
||||
删除remove_span_block_bboxes中的bbox
|
||||
并增加drop相关数据
|
||||
'''
|
||||
spans, dropped_spans_by_removed_bboxes = remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict)
|
||||
|
||||
# 对image和table截图
|
||||
'''对image和table截图'''
|
||||
spans = cut_image_and_table(spans, page, page_id, book_name, save_path, img_s3_client)
|
||||
|
||||
# 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)
|
||||
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
|
||||
displayed_list = []
|
||||
text_inline_lines = []
|
||||
modify_y_axis(spans, displayed_list, text_inline_lines)
|
||||
# 模型识别错误的行间公式, type类型转换成行内公式
|
||||
|
||||
'''模型识别错误的行间公式, type类型转换成行内公式'''
|
||||
spans = modify_inline_equation(spans, displayed_list, text_inline_lines)
|
||||
|
||||
# bbox去除粘连
|
||||
'''bbox去除粘连'''
|
||||
spans = remove_overlap_between_bbox(spans)
|
||||
|
||||
# 对tpye=["interline_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
|
||||
'''
|
||||
对tpye=["interline_equation", "image", "table"]进行额外处理,
|
||||
如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
|
||||
'''
|
||||
spans = adjust_bbox_for_standalone_block(spans)
|
||||
|
||||
|
||||
# 从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)
|
||||
'''从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)'''
|
||||
layout_bboxes, layout_tree = layout_detect(ocr_page_info['subfield_dets'], page, ocr_page_info)
|
||||
|
||||
# 将spans合并成line(在layout内,从上到下,从左到右)
|
||||
'''将spans合并成line(在layout内,从上到下,从左到右)'''
|
||||
lines, dropped_spans_by_layout = merge_spans_to_line_by_layout(spans, layout_bboxes)
|
||||
|
||||
# 将lines合并成block
|
||||
'''将lines合并成block'''
|
||||
blocks = merge_lines_to_block(lines)
|
||||
|
||||
# 根据block合并段落
|
||||
#para_blocks = para_split(blocks, layout_bboxes)
|
||||
|
||||
# 获取QA需要外置的list
|
||||
'''获取QA需要外置的list'''
|
||||
images, tables, interline_equations, inline_equations = get_qa_need_list(blocks)
|
||||
|
||||
# drop的span_list合并
|
||||
'''drop的span_list合并'''
|
||||
dropped_spans = []
|
||||
dropped_spans.extend(dropped_spans_by_span_overlap)
|
||||
dropped_spans.extend(dropped_spans_by_removed_bboxes)
|
||||
@@ -263,19 +253,18 @@ def parse_pdf_by_ocr(
|
||||
elif span['type'] in [ContentType.InlineEquation, ContentType.InterlineEquation]:
|
||||
dropped_equation_block.append(span)
|
||||
|
||||
|
||||
|
||||
# 构造pdf_info_dict
|
||||
'''构造pdf_info_dict'''
|
||||
page_info = construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
|
||||
images, tables, interline_equations, inline_equations,
|
||||
dropped_text_block, dropped_image_block, dropped_table_block, dropped_equation_block,
|
||||
dropped_text_block, dropped_image_block, dropped_table_block,
|
||||
dropped_equation_block,
|
||||
need_remove_spans_bboxes_dict)
|
||||
pdf_info_dict[f"page_{page_id}"] = page_info
|
||||
|
||||
"""分段"""
|
||||
para_split(pdf_info_dict)
|
||||
|
||||
# 在测试时,保存调试信息
|
||||
para_split(pdf_info_dict, debug_mode=debug_mode)
|
||||
|
||||
'''在测试时,保存调试信息'''
|
||||
if debug_mode:
|
||||
params_file_save_path = join_path(
|
||||
save_tmp_path, "md", book_name, "preproc_out.json"
|
||||
|
||||
Reference in New Issue
Block a user