mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-04-12 15:29:03 +07:00
Compare commits
2 Commits
magic_pdf-
...
magic_pdf-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8a2736a53f | ||
|
|
0b35b73c64 |
@@ -4,7 +4,7 @@ import os
|
||||
from loguru import logger
|
||||
from pathlib import Path
|
||||
|
||||
from magic_pdf.dict2md.ocr_mkcontent import mk_nlp_markdown, mk_mm_markdown
|
||||
from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_nlp_markdown, ocr_mk_mm_markdown
|
||||
from magic_pdf.libs.commons import join_path
|
||||
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
|
||||
|
||||
@@ -30,12 +30,12 @@ def read_json_file(file_path):
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
|
||||
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
|
||||
ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
|
||||
ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
|
||||
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
|
||||
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
|
||||
ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1_org.pdf"
|
||||
ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1.json"
|
||||
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1_org.pdf"
|
||||
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1_fix.json"
|
||||
try:
|
||||
ocr_pdf_model_info = read_json_file(ocr_json_file_path)
|
||||
pth = Path(ocr_json_file_path)
|
||||
@@ -56,8 +56,8 @@ if __name__ == '__main__':
|
||||
if not os.path.exists(parent_dir):
|
||||
os.makedirs(parent_dir)
|
||||
|
||||
# markdown_content = mk_nlp_markdown(pdf_info_dict)
|
||||
markdown_content = mk_mm_markdown(pdf_info_dict)
|
||||
# markdown_content = ocr_mk_nlp_markdown(pdf_info_dict)
|
||||
markdown_content = ocr_mk_mm_markdown(pdf_info_dict)
|
||||
|
||||
with open(text_content_save_path, "w", encoding="utf-8") as f:
|
||||
f.write(markdown_content)
|
||||
|
||||
@@ -156,6 +156,9 @@ def parse_pdf_by_ocr(
|
||||
int(x1 / horizontal_scale_ratio),
|
||||
int(y1 / vertical_scale_ratio),
|
||||
]
|
||||
# 删除高度或者宽度为0的spans
|
||||
if bbox[2] - bbox[0] == 0 or bbox[3] - bbox[1] == 0:
|
||||
continue
|
||||
"""要删除的"""
|
||||
# 3: 'header', # 页眉
|
||||
# 4: 'page number', # 页码
|
||||
@@ -193,6 +196,9 @@ def parse_pdf_by_ocr(
|
||||
else:
|
||||
continue
|
||||
|
||||
|
||||
|
||||
|
||||
# 删除重叠spans中较小的那些
|
||||
spans = remove_overlaps_min_spans(spans)
|
||||
|
||||
@@ -202,7 +208,7 @@ def parse_pdf_by_ocr(
|
||||
spans, dropped_text_block, dropped_image_block, dropped_table_block = remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict)
|
||||
|
||||
# 对image和table截图
|
||||
spans = cut_image_and_table(spans, page, page_id, book_name, save_path)
|
||||
spans = cut_image_and_table(spans, page, page_id, book_name, save_path, img_s3_client)
|
||||
|
||||
# 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)
|
||||
displayed_list = []
|
||||
|
||||
@@ -3,7 +3,7 @@ from magic_pdf.libs.ocr_content_type import ContentType
|
||||
from magic_pdf.libs.pdf_image_tools import cut_image
|
||||
|
||||
|
||||
def cut_image_and_table(spans, page, page_id, book_name, save_path):
|
||||
def cut_image_and_table(spans, page, page_id, book_name, save_path, img_s3_client):
|
||||
def s3_return_path(type):
|
||||
return join_path(book_name, type)
|
||||
|
||||
@@ -13,8 +13,8 @@ def cut_image_and_table(spans, page, page_id, book_name, save_path):
|
||||
for span in spans:
|
||||
span_type = span['type']
|
||||
if span_type == ContentType.Image:
|
||||
span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('images'))
|
||||
span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('images'), s3_return_path=s3_return_path('images'), img_s3_client=img_s3_client)
|
||||
elif span_type == ContentType.Table:
|
||||
span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('tables'))
|
||||
span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('tables'), s3_return_path=s3_return_path('tables'), img_s3_client=img_s3_client)
|
||||
|
||||
return spans
|
||||
|
||||
Reference in New Issue
Block a user