Compare commits

...

4 Commits

Author SHA1 Message Date
许瑞
efed5faa53 feat: modify foot note bbox tmp 2024-03-23 14:34:25 +08:00
xu rui
05161c6e62 feat: backup footnote_bbox_tmp 2024-03-23 14:11:50 +08:00
xu rui
15c8830416 feat: comment parse_title 2024-03-23 13:15:32 +08:00
xu rui
432e1ae5e3 feat: process title and footnote 2024-03-22 18:11:44 +08:00
2 changed files with 17 additions and 3 deletions

View File

@@ -253,7 +253,8 @@ def parse_pdf_for_train(
# isSimpleLayout_flag, fullColumn_cnt, subColumn_cnt, curPage_loss = evaluate_pdf_layout(page_id, page, model_output_json)
接下来开始进行预处理过程
"""
# title_bboxs = parse_titles(page_id, page, model_output_json)
"""去掉每页的页码、页眉、页脚"""
page_no_bboxs = parse_pageNos(page_id, page, model_output_json)
header_bboxs = parse_headers(page_id, page, model_output_json)
@@ -530,6 +531,7 @@ def parse_pdf_for_train(
page_info["bak_page_no_bboxes"] = page_no_bboxs
page_info["bak_header_bboxes"] = header_bboxs
page_info["bak_footer_bboxes"] = footer_bboxs
page_info["bak_footer_note_bboxes"] = footnote_bboxes_tmp
pdf_info_dict[f"page_{page_id}"] = page_info

View File

@@ -35,8 +35,16 @@ def convert_to_train_format(jso: dict) -> []:
# 脚注, 目前没有看到例子
for para in v["para_blocks"]:
n_bbox = {"category_id": 2, "bbox": para["bbox"]}
bboxes.append(n_bbox)
if "paras" in para:
paras = para["paras"]
for para_key, para_content in paras.items():
para_bbox = para_content["para_bbox"]
is_para_title = para_content["is_para_title"]
if is_para_title:
n_bbox = {"category_id": 0, "bbox": para_bbox}
else:
n_bbox = {"category_id": 2, "bbox": para_bbox}
bboxes.append(n_bbox)
for inline_equation in v["inline_equations"]:
n_bbox = {"category_id": 13, "bbox": inline_equation["bbox"]}
@@ -46,6 +54,10 @@ def convert_to_train_format(jso: dict) -> []:
n_bbox = {"category_id": 10, "bbox": inter_equation["bbox"]}
bboxes.append(n_bbox)
for footnote_bbox in v["bak_footer_note_bboxes"]:
n_bbox = {"category_id": 5, "bbox": list(footnote_bbox)}
bboxes.append(n_bbox)
info["bboxes"] = bboxes
info["layout_tree"] = v["layout_bboxes"]
pages.append(info)