mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
Compare commits
12 Commits
release-0.
...
magic_pdf-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c175001d6a | ||
|
|
a35785b9e5 | ||
|
|
a7296f78f0 | ||
|
|
ed822634df | ||
|
|
b0529b6fbd | ||
|
|
7f8dc353b0 | ||
|
|
384e03799c | ||
|
|
41545a13c6 | ||
|
|
b17084febb | ||
|
|
f11f3d6032 | ||
|
|
f8828be7eb | ||
|
|
b03a7fae5e |
@@ -20,8 +20,8 @@ def fitz_doc_to_image(doc, dpi=200) -> dict:
|
||||
mat = fitz.Matrix(dpi / 72, dpi / 72)
|
||||
pm = doc.get_pixmap(matrix=mat, alpha=False)
|
||||
|
||||
# If the width or height exceeds 9000 after scaling, do not scale further.
|
||||
if pm.width > 9000 or pm.height > 9000:
|
||||
# If the width or height exceeds 4500 after scaling, do not scale further.
|
||||
if pm.width > 4500 or pm.height > 4500:
|
||||
pm = doc.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
|
||||
|
||||
img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
|
||||
|
||||
@@ -1 +1 @@
|
||||
__version__ = "0.10.3"
|
||||
__version__ = "0.10.4"
|
||||
|
||||
@@ -112,8 +112,8 @@ def __is_list_or_index_block(block):
|
||||
line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
|
||||
block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
|
||||
if (
|
||||
line['bbox'][0] - block['bbox_fs'][0] > 0.8 * line_height
|
||||
and block['bbox_fs'][2] - line['bbox'][2] > 0.8 * line_height
|
||||
line['bbox'][0] - block['bbox_fs'][0] > 0.7 * line_height
|
||||
and block['bbox_fs'][2] - line['bbox'][2] > 0.7 * line_height
|
||||
):
|
||||
external_sides_not_close_num += 1
|
||||
if abs(line_mid_x - block_mid_x) < line_height / 2:
|
||||
|
||||
@@ -12,12 +12,12 @@ def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
|
||||
for span in spans:
|
||||
span_type = span['type']
|
||||
if span_type == ContentType.Image:
|
||||
if not check_img_bbox(span['bbox']):
|
||||
if not check_img_bbox(span['bbox']) or not imageWriter:
|
||||
continue
|
||||
span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('images'),
|
||||
imageWriter=imageWriter)
|
||||
elif span_type == ContentType.Table:
|
||||
if not check_img_bbox(span['bbox']):
|
||||
if not check_img_bbox(span['bbox']) or not imageWriter:
|
||||
continue
|
||||
span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'),
|
||||
imageWriter=imageWriter)
|
||||
|
||||
@@ -15,7 +15,7 @@ from magic_pdf.data.data_reader_writer import FileBasedDataWriter
|
||||
from magic_pdf.libs.json_compressor import JsonCompressor
|
||||
from magic_pdf.pipe.UNIPipe import UNIPipe
|
||||
|
||||
from ..extensions import app, db
|
||||
from ..extentions import app, db
|
||||
from .ext import find_file
|
||||
from .models import AnalysisPdf, AnalysisTask
|
||||
|
||||
|
||||
Reference in New Issue
Block a user