mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
@@ -20,8 +20,8 @@ def fitz_doc_to_image(doc, dpi=200) -> dict:
|
||||
mat = fitz.Matrix(dpi / 72, dpi / 72)
|
||||
pm = doc.get_pixmap(matrix=mat, alpha=False)
|
||||
|
||||
# If the width or height exceeds 9000 after scaling, discard the scaled render and fall back to the original size (scale factor 1).
|
||||
if pm.width > 9000 or pm.height > 9000:
|
||||
# If the width or height exceeds 4500 after scaling, discard the scaled render and fall back to the original size (scale factor 1).
|
||||
if pm.width > 4500 or pm.height > 4500:
|
||||
pm = doc.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
|
||||
|
||||
img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
|
||||
|
||||
@@ -12,12 +12,12 @@ def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
|
||||
for span in spans:
|
||||
span_type = span['type']
|
||||
if span_type == ContentType.Image:
|
||||
if not check_img_bbox(span['bbox']):
|
||||
if not check_img_bbox(span['bbox']) or not imageWriter:
|
||||
continue
|
||||
span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('images'),
|
||||
imageWriter=imageWriter)
|
||||
elif span_type == ContentType.Table:
|
||||
if not check_img_bbox(span['bbox']):
|
||||
if not check_img_bbox(span['bbox']) or not imageWriter:
|
||||
continue
|
||||
span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'),
|
||||
imageWriter=imageWriter)
|
||||
|
||||
Reference in New Issue
Block a user