diff --git a/.gitignore b/.gitignore index 1451c8bf..97329d1f 100644 --- a/.gitignore +++ b/.gitignore @@ -31,5 +31,6 @@ tmp .vscode .vscode/ /tests/ +ocr_demo /app/common/__init__.py diff --git a/demo/ocr_demo.py b/demo/ocr_demo.py index 7f3b3674..00c055f0 100644 --- a/demo/ocr_demo.py +++ b/demo/ocr_demo.py @@ -4,7 +4,7 @@ import os from loguru import logger from pathlib import Path -from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_nlp_markdown, ocr_mk_mm_markdown +from magic_pdf.dict2md.ocr_mkcontent import mk_mm_markdown2, mk_nlp_markdown, mk_mm_markdown from magic_pdf.libs.commons import join_path from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr @@ -34,8 +34,9 @@ if __name__ == '__main__': ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json" # ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf" # ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json" - # ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1_org.pdf" - # ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1_fix.json" + + ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf" + ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json" try: ocr_pdf_model_info = read_json_file(ocr_json_file_path) pth = Path(ocr_json_file_path) @@ -56,8 +57,8 @@ if __name__ == '__main__': if not os.path.exists(parent_dir): os.makedirs(parent_dir) - # markdown_content = ocr_mk_nlp_markdown(pdf_info_dict) - markdown_content = ocr_mk_mm_markdown(pdf_info_dict) + # markdown_content = mk_nlp_markdown(pdf_info_dict) + markdown_content = mk_mm_markdown2(pdf_info_dict) with open(text_content_save_path, "w", encoding="utf-8") as f: f.write(markdown_content) diff --git a/magic_pdf/dict2md/ocr_mkcontent.py b/magic_pdf/dict2md/ocr_mkcontent.py index fbba2aad..f66c443a 100644 --- 
def mk_mm_markdown2(pdf_info_dict: dict):
    """Render the paragraph-split pages of a parsed PDF as multimodal markdown.

    Args:
        pdf_info_dict: mapping whose values are per-page dicts. Only the
            "para_blocks" entry of each page is read; pages where it is
            missing or empty are skipped. Each paragraph is a list of lines
            and each line carries a 'spans' list.

    Returns:
        A single string in which paragraphs are separated by one blank line.
        Text spans are emitted verbatim, inline equations as `` $...$ ``,
        displayed equations as a ``$$`` fence and images as ``![](path)``.
        Spans of any other type contribute nothing, and a paragraph that
        renders to an empty string is dropped instead of leaving a run of
        blank lines in the output.

    Raises:
        KeyError: if a supported span lacks 'content' (or 'image_path' for
            images), or a line lacks 'spans'.
    """

    def render_span(span):
        # Markdown fragment for one span; '' for span types we do not render.
        span_type = span.get('type')
        if span_type == 'text':
            return span['content']
        if span_type == 'inline_equation':
            return f" ${span['content']}$ "
        if span_type == 'displayed_equation':
            return f"$$\n{span['content']}\n$$ "
        if span_type == 'image':
            return f"![]({span['image_path']}) "
        return ''

    markdown = []
    for page_info in pdf_info_dict.values():
        for para in page_info.get("para_blocks") or []:
            # Join once instead of growing a string with += span by span.
            para_text = ''.join(
                render_span(span) for line in para for span in line['spans']
            )
            if para_text:
                markdown.append(para_text)

    return '\n\n'.join(markdown)


# Characters that count as the end of a sentence/line when deciding where a
# paragraph stops.
LINE_STOP_FLAG = ['.', '!', '?', '。', '!', '?',":", ":", ")", ")", ";"]
INLINE_EQUATION = 'inline_equation'
INTER_EQUATION = "displayed_equation"
TEXT = "text"


def __add_line_period(blocks, layout_bboxes):
    """Append a period to lines that end in an unpunctuated inline equation.

    A line whose last span is an inline equation not ending in one of
    LINE_STOP_FLAG is treated as the end of a paragraph, so a '.' is appended
    to that span. The span content is also stripped of surrounding whitespace
    when this happens. Lines ending in any other span type are left alone.

    Args:
        blocks: list of block dicts, each with a 'lines' list; every line has
            a 'spans' list of dicts with 'type' and 'content'. Modified in
            place.
        layout_bboxes: accepted for signature symmetry with the other
            pre-processing steps; not used here.
    """
    for block in blocks:
        for line in block['lines']:
            spans = line['spans']
            if not spans:
                # Nothing to punctuate on a line without spans.
                continue
            last_span = spans[-1]
            if last_span['type'] != INLINE_EQUATION:
                continue
            span_content = last_span['content'].strip()
            # An empty equation has no last character to inspect; leave it be.
            if span_content and span_content[-1] not in LINE_STOP_FLAG:
                last_span['content'] = span_content + '.'
def __cluster_snap_map(coords, eps, min_samples, pick):
    """Cluster 1-D coordinates and map each clustered value to ``pick(cluster)``.

    Args:
        coords: sequence of numeric coordinates.
        eps: DBSCAN neighbourhood radius.
        min_samples: DBSCAN minimum cluster size.
        pick: reducer applied to the members of a cluster (e.g. np.min).

    Returns:
        dict mapping every coordinate that belongs to a cluster to the value
        chosen for that cluster. Coordinates labelled as noise are absent.
    """
    # DBSCAN expects 2-D samples, so pad every coordinate with a constant 0.
    points = np.array([[c, 0] for c in coords])
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit(points).labels_
    snapped = {}
    for label in np.unique(labels):
        if label == -1:  # noise: not close enough to anything, keep as is
            continue
        members = points[labels == label][:, 0]
        target = pick(members)
        snapped.update({raw: target for raw in members})
    return snapped


def __valign_lines(blocks, layout_bboxes):
    """Align the left and right edges of the lines inside each layout box.

    Within one layout box, left edges (x0) that cluster together are snapped
    to the smallest x0 of their cluster and right edges (x1) to the largest
    x1 of theirs. Clustering uses DBSCAN with eps=3 and min_samples=2; the 3
    is an empirical value (TODO: derive it from the data). Edges that do not
    fall in any cluster are left untouched. Afterwards the bbox of every
    affected block is recomputed from its lines.

    Args:
        blocks: list of block dicts with 'bbox' and 'lines'; line bboxes and
            block bboxes are modified in place.
        layout_bboxes: list of dicts carrying a 'layout_bbox'.
    """
    min_distance = 3
    min_sample = 2

    for layout_box in layout_bboxes:
        blocks_in_layoutbox = [b for b in blocks if _is_in(b['bbox'], layout_box['layout_bbox'])]
        lines_in_layoutbox = [line for block in blocks_in_layoutbox for line in block['lines']]
        if not lines_in_layoutbox:
            # No blocks here, or only blocks without lines: nothing to cluster.
            continue

        # old coordinate -> aligned coordinate
        x0_2_new_val = __cluster_snap_map(
            [line['bbox'][0] for line in lines_in_layoutbox], min_distance, min_sample, np.min)
        x1_2_new_val = __cluster_snap_map(
            [line['bbox'][2] for line in lines_in_layoutbox], min_distance, min_sample, np.max)

        for line in lines_in_layoutbox:
            x0, x1 = line['bbox'][0], line['bbox'][2]
            if x0 in x0_2_new_val:
                line['bbox'][0] = int(x0_2_new_val[x0])
            if x1 in x1_2_new_val:
                line['bbox'][2] = int(x1_2_new_val[x1])
            # edges that could not be aligned stay where they are

        # Line extents changed, so the enclosing block bboxes must be rebuilt.
        for block in blocks_in_layoutbox:
            block_lines = block['lines']
            if not block_lines:
                continue
            block['bbox'] = [min(line['bbox'][0] for line in block_lines),
                             min(line['bbox'][1] for line in block_lines),
                             max(line['bbox'][2] for line in block_lines),
                             max(line['bbox'][3] for line in block_lines)]


def __common_pre_proc(blocks, layout_bboxes):
    """Run the language-independent pre-processing steps on ``blocks`` in place."""
    __add_line_period(blocks, layout_bboxes)
    __valign_lines(blocks, layout_bboxes)


def __pre_proc_zh_blocks(blocks, layout_bboxes):
    """Pre-process Chinese text before paragraph splitting (not implemented yet)."""
    pass


def __pre_proc_en_blocks(blocks, layout_bboxes):
    """Pre-process English text before paragraph splitting (not implemented yet)."""
    pass


def __group_line_by_layout(blocks, layout_bboxes, lang="en"):
    """Collect, for every layout box, the lines of the blocks lying inside it.

    Args:
        blocks: list of block dicts with 'bbox' and 'lines'.
        layout_bboxes: list of dicts carrying a 'layout_bbox'.
        lang: accepted but not used by the current implementation.

    Returns:
        A list with one entry per layout box (same order as ``layout_bboxes``),
        each entry being the flat list of lines found in that box. The entry
        is empty when no block lies inside the box.
    """
    # Original author's note: for now a block holds a single line, so a block
    # is effectively one paragraph.
    lines_group = []

    for lyout in layout_bboxes:
        lines = [line for block in blocks if _is_in(block['bbox'], lyout['layout_bbox']) for line in block['lines']]
        lines_group.append(lines)

    return lines_group


def __split_para_in_layoutbox(lines_group, layout_bboxes, lang="en", char_avg_len=10):
    """Split the lines of each layout box into paragraphs.

    The right boundary of a group is the largest x1 among its lines. A line
    whose last span is text or an inline equation closes the current
    paragraph when it ends with one of LINE_STOP_FLAG or when its right edge
    stops more than ``1.5 * char_avg_len`` short of that boundary. A line
    ending in any other span type (image, table, displayed equation) always
    closes the paragraph.

    Args:
        lines_group: list of line lists, one per layout box.
        layout_bboxes: accepted but not used by the current implementation.
        lang: accepted but not used by the current implementation.
        char_avg_len: assumed average character width used to size the
            right-margin tolerance.

    Returns:
        A flat list of paragraphs across all groups; each paragraph is a list
        of line dicts.
    """
    paras = []
    right_tail_distance = 1.5 * char_avg_len
    for lines in lines_group:
        if not lines:
            continue
        layout_right = max(line['bbox'][2] for line in lines)
        para = []  # lines of the paragraph being built
        for line in lines:
            para.append(line)
            spans = line['spans']
            if not spans:
                # A line without spans gives no end-of-paragraph signal; keep
                # it in the running paragraph rather than failing on spans[-1].
                continue
            last_span = spans[-1]
            if last_span['type'] in [TEXT, INLINE_EQUATION]:
                content = last_span['content']
                # Guard the empty string: there is no last character to test.
                ends_with_stop = bool(content) and content[-1] in LINE_STOP_FLAG
                stops_short = line['bbox'][2] < layout_right - right_tail_distance
                para_done = ends_with_stop or stops_short
            else:
                # NOTE(review): the non-text line is appended to whatever text
                # lines are still pending, so they end up in one paragraph —
                # confirm this is intended rather than flushing first.
                para_done = True
            if para_done:
                paras.append(para)
                para = []
        if para:
            # Trailing lines that never hit an end-of-paragraph condition.
            paras.append(para)

    return paras


def __do_split(blocks, layout_bboxes, lang="en"):
    """Split blocks into paragraphs from line and layout information.

    This is a first, simple implementation driven by line-ending features:
      1. Scan the lines of every layout and find those whose right end stops
         some distance before the layout's right boundary.
      2. Among the lines, find those ending with a period or another mark
         that may act as a break.
      3. Split paragraphs according to these line-ending features.
      4. Figures and tables currently occupy a line of their own and are not
         considered for splitting.

    Returns:
        The list of paragraphs produced by ``__split_para_in_layoutbox``.
    """
    lines_group = __group_line_by_layout(blocks, layout_bboxes, lang)  # gather lines per layout
    layout_paras = __split_para_in_layoutbox(lines_group, layout_bboxes, lang)  # split inside each layout

    return layout_paras


def para_split(blocks, layout_bboxes, lang="en"):
    """Split the blocks of a page into paragraphs using line and layout info.

    Args:
        blocks: list of block dicts; pre-processing modifies them in place
            (line-ending periods and aligned bboxes).
        layout_bboxes: list of dicts carrying a 'layout_bbox'.
        lang: language hint forwarded to the splitter. The language-specific
            pre-processing hooks in this module are still empty and are not
            invoked.

    Returns:
        A list of paragraphs, each a list of line dicts.
    """
    __common_pre_proc(blocks, layout_bboxes)
    # The split is identical for every language for now, so run it exactly
    # once instead of once per language branch plus once more for the result.
    splited_blocks = __do_split(blocks, layout_bboxes, lang)

    return splited_blocks
interline_equations, inline_equations, dropped_text_block, dropped_image_block, dropped_table_block, need_remove_spans_bboxes_dict): return_dict = { 'preproc_blocks': blocks, + "para_blocks": para_blocks, # 分好段落的blocks 'layout_bboxes': layout_bboxes, 'page_idx': page_id, 'page_size': [page_w, page_h], @@ -234,13 +236,13 @@ def parse_pdf_by_ocr( blocks = merge_lines_to_block(lines) # 根据block合并段落 - - + para_blocks = para_split(blocks, layout_bboxes) + # 获取QA需要外置的list images, tables, interline_equations, inline_equations = get_qa_need_list(blocks) # 构造pdf_info_dict - page_info = construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree, + page_info = construct_page_component(blocks, para_blocks, layout_bboxes, page_id, page_w, page_h, layout_tree, images, tables, interline_equations, inline_equations, dropped_text_block, dropped_image_block, dropped_table_block, need_remove_spans_bboxes_dict)