mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
Compare commits
4 Commits
magic_pdf-
...
magic_pdf-
| Author | SHA1 | Date | |
|---|---|---|---|
| | efed5faa53 | | |
| | 05161c6e62 | | |
| | 15c8830416 | | |
| | 432e1ae5e3 | | |
@@ -253,7 +253,8 @@ def parse_pdf_for_train(
|
||||
# isSimpleLayout_flag, fullColumn_cnt, subColumn_cnt, curPage_loss = evaluate_pdf_layout(page_id, page, model_output_json)
|
||||
接下来开始进行预处理过程
|
||||
"""
|
||||
|
||||
# title_bboxs = parse_titles(page_id, page, model_output_json)
|
||||
|
||||
"""去掉每页的页码、页眉、页脚"""
|
||||
page_no_bboxs = parse_pageNos(page_id, page, model_output_json)
|
||||
header_bboxs = parse_headers(page_id, page, model_output_json)
|
||||
@@ -530,6 +531,7 @@ def parse_pdf_for_train(
|
||||
page_info["bak_page_no_bboxes"] = page_no_bboxs
|
||||
page_info["bak_header_bboxes"] = header_bboxs
|
||||
page_info["bak_footer_bboxes"] = footer_bboxs
|
||||
page_info["bak_footer_note_bboxes"] = footnote_bboxes_tmp
|
||||
|
||||
pdf_info_dict[f"page_{page_id}"] = page_info
|
||||
|
||||
|
||||
@@ -35,8 +35,16 @@ def convert_to_train_format(jso: dict) -> []:
|
||||
|
||||
# 脚注, 目前没有看到例子
|
||||
for para in v["para_blocks"]:
|
||||
n_bbox = {"category_id": 2, "bbox": para["bbox"]}
|
||||
bboxes.append(n_bbox)
|
||||
if "paras" in para:
|
||||
paras = para["paras"]
|
||||
for para_key, para_content in paras.items():
|
||||
para_bbox = para_content["para_bbox"]
|
||||
is_para_title = para_content["is_para_title"]
|
||||
if is_para_title:
|
||||
n_bbox = {"category_id": 0, "bbox": para_bbox}
|
||||
else:
|
||||
n_bbox = {"category_id": 2, "bbox": para_bbox}
|
||||
bboxes.append(n_bbox)
|
||||
|
||||
for inline_equation in v["inline_equations"]:
|
||||
n_bbox = {"category_id": 13, "bbox": inline_equation["bbox"]}
|
||||
@@ -46,6 +54,10 @@ def convert_to_train_format(jso: dict) -> []:
|
||||
n_bbox = {"category_id": 10, "bbox": inter_equation["bbox"]}
|
||||
bboxes.append(n_bbox)
|
||||
|
||||
for footnote_bbox in v["bak_footer_note_bboxes"]:
|
||||
n_bbox = {"category_id": 5, "bbox": list(footnote_bbox)}
|
||||
bboxes.append(n_bbox)
|
||||
|
||||
info["bboxes"] = bboxes
|
||||
info["layout_tree"] = v["layout_bboxes"]
|
||||
pages.append(info)
|
||||
|
||||
Reference in New Issue
Block a user