mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
Compare commits
2 Commits
magic_pdf-
...
magic_pdf-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a01356400e | ||
|
|
f10b4a501f |
@@ -1,3 +1,4 @@
|
||||
from magic_pdf.libs.commons import s3_image_save_path, join_path
|
||||
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
|
||||
from magic_pdf.libs.ocr_content_type import ContentType
|
||||
|
||||
@@ -42,7 +43,7 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
|
||||
if not span.get('image_path'):
|
||||
continue
|
||||
else:
|
||||
content = f""
|
||||
content = f"})"
|
||||
else:
|
||||
content = ocr_escape_special_markdown_char(span['content']) # 转义特殊符号
|
||||
if span['type'] == ContentType.InlineEquation:
|
||||
@@ -73,7 +74,7 @@ def mk_mm_markdown2(pdf_info_dict:dict):
|
||||
elif span_type == ContentType.InterlineEquation:
|
||||
para_text += f"$$\n{span['content']}\n$$ "
|
||||
elif span_type == ContentType.Image:
|
||||
para_text += f" "
|
||||
para_text += f"})"
|
||||
markdown.append(para_text)
|
||||
|
||||
return '\n\n'.join(markdown)
|
||||
|
||||
@@ -24,6 +24,8 @@ error_log_path = "s3://llm-pdf-text/err_logs/"
|
||||
# json_dump_path = "s3://pdf_books_temp/json_dump/" # 这条路径仅用于临时本地测试,不能提交到main
|
||||
json_dump_path = "s3://llm-pdf-text/json_dump/"
|
||||
|
||||
s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/"
|
||||
|
||||
|
||||
def get_top_percent_list(num_list, percent):
|
||||
"""
|
||||
|
||||
@@ -4,7 +4,7 @@ import time
|
||||
from urllib.parse import quote
|
||||
|
||||
from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_nlp_markdown, ocr_mk_mm_markdown
|
||||
from magic_pdf.libs.commons import read_file, join_path, parse_bucket_key, formatted_time
|
||||
from magic_pdf.libs.commons import read_file, join_path, parse_bucket_key, formatted_time, s3_image_save_path
|
||||
from magic_pdf.libs.drop_reason import DropReason
|
||||
from magic_pdf.libs.json_compressor import JsonCompressor
|
||||
from magic_pdf.dict2md.mkcontent import mk_nlp_markdown
|
||||
@@ -287,7 +287,7 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
|
||||
# jso['drop_reason'] = DropReason.HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES
|
||||
else:
|
||||
try:
|
||||
save_path = "s3://mllm-raw-media/pdf2md_img/"
|
||||
save_path = s3_image_save_path
|
||||
image_s3_config = get_s3_config(save_path)
|
||||
start_time = time.time() # 记录开始时间
|
||||
# 先打印一下book_name和解析开始的时间
|
||||
@@ -328,7 +328,7 @@ def ocr_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
|
||||
file_id = jso.get('file_id')
|
||||
book_name = f"{data_source}/{file_id}"
|
||||
try:
|
||||
save_path = "s3://mllm-raw-media/pdf2md_img/"
|
||||
save_path = s3_image_save_path
|
||||
image_s3_config = get_s3_config(save_path)
|
||||
start_time = time.time() # 记录开始时间
|
||||
# 先打印一下book_name和解析开始的时间
|
||||
|
||||
@@ -24,34 +24,37 @@ def line_sort_spans_by_left_to_right(lines):
|
||||
return line_objects
|
||||
|
||||
def merge_spans_to_line(spans):
|
||||
# 按照y0坐标排序
|
||||
spans.sort(key=lambda span: span['bbox'][1])
|
||||
if len(spans) == 0:
|
||||
return []
|
||||
else:
|
||||
# 按照y0坐标排序
|
||||
spans.sort(key=lambda span: span['bbox'][1])
|
||||
|
||||
lines = []
|
||||
current_line = [spans[0]]
|
||||
for span in spans[1:]:
|
||||
# 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
|
||||
# image和table类型,同上
|
||||
if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
|
||||
s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line):
|
||||
# 则开始新行
|
||||
lines = []
|
||||
current_line = [spans[0]]
|
||||
for span in spans[1:]:
|
||||
# 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
|
||||
# image和table类型,同上
|
||||
if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
|
||||
s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line):
|
||||
# 则开始新行
|
||||
lines.append(current_line)
|
||||
current_line = [span]
|
||||
continue
|
||||
|
||||
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
|
||||
if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
|
||||
current_line.append(span)
|
||||
else:
|
||||
# 否则,开始新行
|
||||
lines.append(current_line)
|
||||
current_line = [span]
|
||||
|
||||
# 添加最后一行
|
||||
if current_line:
|
||||
lines.append(current_line)
|
||||
current_line = [span]
|
||||
continue
|
||||
|
||||
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
|
||||
if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
|
||||
current_line.append(span)
|
||||
else:
|
||||
# 否则,开始新行
|
||||
lines.append(current_line)
|
||||
current_line = [span]
|
||||
|
||||
# 添加最后一行
|
||||
if current_line:
|
||||
lines.append(current_line)
|
||||
|
||||
return lines
|
||||
return lines
|
||||
|
||||
def merge_spans_to_line_by_layout(spans, layout_bboxes):
|
||||
lines = []
|
||||
|
||||
@@ -77,70 +77,73 @@ def adjust_bbox_for_standalone_block(spans):
|
||||
|
||||
def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
|
||||
# displayed_list = []
|
||||
# 如果spans为空,则不处理
|
||||
if len(spans) == 0:
|
||||
pass
|
||||
else:
|
||||
spans.sort(key=lambda span: span['bbox'][1])
|
||||
|
||||
spans.sort(key=lambda span: span['bbox'][1])
|
||||
lines = []
|
||||
current_line = [spans[0]]
|
||||
if spans[0]["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
|
||||
displayed_list.append(spans[0])
|
||||
|
||||
lines = []
|
||||
current_line = [spans[0]]
|
||||
if spans[0]["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
|
||||
displayed_list.append(spans[0])
|
||||
line_first_y0 = spans[0]["bbox"][1]
|
||||
line_first_y = spans[0]["bbox"][3]
|
||||
# 用于给行间公式搜索
|
||||
# text_inline_lines = []
|
||||
for span in spans[1:]:
|
||||
# if span.get("content","") == "78.":
|
||||
# print("debug")
|
||||
# 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
|
||||
# image和table类型,同上
|
||||
if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
|
||||
s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line):
|
||||
# 传入
|
||||
if span["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
|
||||
displayed_list.append(span)
|
||||
# 则开始新行
|
||||
lines.append(current_line)
|
||||
if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
|
||||
text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
|
||||
current_line = [span]
|
||||
line_first_y0 = span["bbox"][1]
|
||||
line_first_y = span["bbox"][3]
|
||||
continue
|
||||
|
||||
line_first_y0 = spans[0]["bbox"][1]
|
||||
line_first_y = spans[0]["bbox"][3]
|
||||
# 用于给行间公式搜索
|
||||
# text_inline_lines = []
|
||||
for span in spans[1:]:
|
||||
# if span.get("content","") == "78.":
|
||||
# print("debug")
|
||||
# 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
|
||||
# image和table类型,同上
|
||||
if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
|
||||
s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line):
|
||||
# 传入
|
||||
if span["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
|
||||
displayed_list.append(span)
|
||||
# 则开始新行
|
||||
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
|
||||
if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
|
||||
if span["type"] == "text":
|
||||
line_first_y0 = span["bbox"][1]
|
||||
line_first_y = span["bbox"][3]
|
||||
current_line.append(span)
|
||||
|
||||
else:
|
||||
# 否则,开始新行
|
||||
lines.append(current_line)
|
||||
text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
|
||||
current_line = [span]
|
||||
line_first_y0 = span["bbox"][1]
|
||||
line_first_y = span["bbox"][3]
|
||||
|
||||
# 添加最后一行
|
||||
if current_line:
|
||||
lines.append(current_line)
|
||||
if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
|
||||
text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
|
||||
current_line = [span]
|
||||
line_first_y0 = span["bbox"][1]
|
||||
line_first_y = span["bbox"][3]
|
||||
continue
|
||||
for line in text_inline_lines:
|
||||
# 按照x0坐标排序
|
||||
current_line = line[0]
|
||||
current_line.sort(key=lambda span: span['bbox'][0])
|
||||
|
||||
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
|
||||
if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
|
||||
if span["type"] == "text":
|
||||
line_first_y0 = span["bbox"][1]
|
||||
line_first_y = span["bbox"][3]
|
||||
current_line.append(span)
|
||||
# 调整每一个文字行内bbox统一
|
||||
for line in text_inline_lines:
|
||||
current_line, (line_first_y0, line_first_y) = line
|
||||
for span in current_line:
|
||||
span["bbox"][1] = line_first_y0
|
||||
span["bbox"][3] = line_first_y
|
||||
|
||||
else:
|
||||
# 否则,开始新行
|
||||
lines.append(current_line)
|
||||
text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
|
||||
current_line = [span]
|
||||
line_first_y0 = span["bbox"][1]
|
||||
line_first_y = span["bbox"][3]
|
||||
|
||||
# 添加最后一行
|
||||
if current_line:
|
||||
lines.append(current_line)
|
||||
if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
|
||||
text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
|
||||
for line in text_inline_lines:
|
||||
# 按照x0坐标排序
|
||||
current_line = line[0]
|
||||
current_line.sort(key=lambda span: span['bbox'][0])
|
||||
|
||||
# 调整每一个文字行内bbox统一
|
||||
for line in text_inline_lines:
|
||||
current_line, (line_first_y0, line_first_y) = line
|
||||
for span in current_line:
|
||||
span["bbox"][1] = line_first_y0
|
||||
span["bbox"][3] = line_first_y
|
||||
|
||||
# return spans, displayed_list, text_inline_lines
|
||||
# return spans, displayed_list, text_inline_lines
|
||||
|
||||
|
||||
def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list):
|
||||
|
||||
Reference in New Issue
Block a user