mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
ocr_construct_page_component 位置移动
This commit is contained in:
@@ -53,7 +53,7 @@ from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
|
||||
from magic_pdf.pre_proc.equations_replace import combine_chars_to_pymudict, remove_chars_in_text_blocks, replace_equations_in_textblock
|
||||
from magic_pdf.pre_proc.pdf_pre_filter import pdf_filter
|
||||
from magic_pdf.pre_proc.detect_footer_header_by_statistics import drop_footer_header
|
||||
from magic_pdf.pre_proc.construct_paras import construct_page_component
|
||||
from magic_pdf.pre_proc.construct_page_dict import construct_page_component
|
||||
from magic_pdf.pre_proc.fix_image import combine_images, fix_image_vertical, fix_seperated_image, include_img_title
|
||||
from magic_pdf.post_proc.pdf_post_filter import pdf_post_filter
|
||||
from magic_pdf.pre_proc.remove_rotate_bbox import get_side_boundry, remove_rotate_side_textblock, remove_side_blank_block
|
||||
|
||||
@@ -18,6 +18,7 @@ from magic_pdf.libs.drop_tag import DropTag
|
||||
from magic_pdf.libs.ocr_content_type import ContentType
|
||||
from magic_pdf.libs.safe_filename import sanitize_filename
|
||||
from magic_pdf.para.para_split import para_split
|
||||
from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component
|
||||
from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
|
||||
from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model
|
||||
from magic_pdf.pre_proc.detect_header import parse_headers
|
||||
@@ -33,28 +34,6 @@ from magic_pdf.pre_proc.ocr_span_list_modify import remove_spans_by_bboxes, remo
|
||||
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
|
||||
|
||||
|
||||
def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
                             images, tables, interline_equations, inline_equations,
                             dropped_text_block, dropped_image_block, dropped_table_block, dropped_equation_block,
                             need_remove_spans_bboxes_dict):
    """Pack the per-page parsing results into a single page-info dict.

    Each argument is stored under a fixed key exactly as it was passed in
    (no copying, no validation). ``page_w`` and ``page_h`` are combined into
    the two-element list stored under ``'page_size'``.

    Returns:
        dict: the page-info mapping, with keys in the order listed below.
    """
    # The key spelling is deliberately left untouched: most "dropped" keys are
    # spelled 'droped_*' while the equation one is 'dropped_equation_block'.
    # NOTE(review): presumably other code reads these exact keys -- confirm
    # before normalising the spelling.
    page_info = dict(
        preproc_blocks=blocks,
        layout_bboxes=layout_bboxes,
        page_idx=page_id,
        page_size=[page_w, page_h],
        _layout_tree=layout_tree,
        images=images,
        tables=tables,
        interline_equations=interline_equations,
        inline_equations=inline_equations,
        droped_text_block=dropped_text_block,
        droped_image_block=dropped_image_block,
        droped_table_block=dropped_table_block,
        dropped_equation_block=dropped_equation_block,
        droped_bboxes=need_remove_spans_bboxes_dict,
    )
    return page_info
|
||||
|
||||
|
||||
def parse_pdf_by_ocr(
|
||||
pdf_path,
|
||||
@@ -254,7 +233,7 @@ def parse_pdf_by_ocr(
|
||||
dropped_equation_block.append(span)
|
||||
|
||||
'''构造pdf_info_dict'''
|
||||
page_info = construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
|
||||
page_info = ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
|
||||
images, tables, interline_equations, inline_equations,
|
||||
dropped_text_block, dropped_image_block, dropped_table_block,
|
||||
dropped_equation_block,
|
||||
|
||||
@@ -75,7 +75,7 @@ from magic_pdf.pre_proc.equations_replace import (
|
||||
)
|
||||
from magic_pdf.pre_proc.pdf_pre_filter import pdf_filter
|
||||
from magic_pdf.pre_proc.detect_footer_header_by_statistics import drop_footer_header
|
||||
from magic_pdf.pre_proc.construct_paras import construct_page_component
|
||||
from magic_pdf.pre_proc.construct_page_dict import construct_page_component
|
||||
from magic_pdf.pre_proc.fix_image import (
|
||||
combine_images,
|
||||
fix_image_vertical,
|
||||
|
||||
@@ -28,3 +28,26 @@ def construct_page_component(page_id, image_info, table_info, text_blocks_prepr
|
||||
return_dict['footnote_bboxes_tmp'] = footnote_bboxes_tmp
|
||||
|
||||
return return_dict
|
||||
|
||||
|
||||
def ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
                                 images, tables, interline_equations, inline_equations,
                                 dropped_text_block, dropped_image_block, dropped_table_block, dropped_equation_block,
                                 need_remove_spans_bboxes_dict):
    """Bundle the results of parsing one page into a page-info dict.

    Every argument is placed under a fixed key unchanged (no copy is made and
    nothing is validated). The page width and height are merged into the
    two-element list stored under ``'page_size'``.

    Returns:
        dict: the assembled page-info mapping.
    """
    # (key, value) pairs in the exact order the resulting dict should hold them.
    # Key spelling is intentionally mixed ('droped_*' vs 'dropped_equation_block')
    # and is kept byte-for-byte.
    # NOTE(review): presumably readers of this dict rely on these names -- confirm
    # before renaming any of them.
    entries = (
        ('preproc_blocks', blocks),
        ('layout_bboxes', layout_bboxes),
        ('page_idx', page_id),
        ('page_size', [page_w, page_h]),
        ('_layout_tree', layout_tree),
        ('images', images),
        ('tables', tables),
        ('interline_equations', interline_equations),
        ('inline_equations', inline_equations),
        ('droped_text_block', dropped_text_block),
        ('droped_image_block', dropped_image_block),
        ('droped_table_block', dropped_table_block),
        ('dropped_equation_block', dropped_equation_block),
        ('droped_bboxes', need_remove_spans_bboxes_dict),
    )
    return dict(entries)
|
||||
Reference in New Issue
Block a user