mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-29 20:18:35 +07:00
Compare commits
10 Commits
mineru-3.0
...
dev
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
264c594b23 | ||
|
|
a6b6d3081c | ||
|
|
1f82300d1d | ||
|
|
2d4fa2cc8e | ||
|
|
71b9e9f780 | ||
|
|
5e51ab2934 | ||
|
|
520c61faaf | ||
|
|
72601b314a | ||
|
|
e54c67dcd1 | ||
|
|
33e4fbd694 |
@@ -50,7 +50,7 @@ A WebUI developed based on Gradio, with a simple interface and only core parsing
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Accuracy<sup>1</sup></th>
|
||||
<td style="text-align:center;">82+</td>
|
||||
<td style="text-align:center;">86+</td>
|
||||
<td colspan="4" style="text-align:center;">90+</td>
|
||||
</tr>
|
||||
<tr>
|
||||
@@ -70,15 +70,15 @@ A WebUI developed based on Gradio, with a simple interface and only core parsing
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Min VRAM</th>
|
||||
<td style="text-align:center;">6GB</td>
|
||||
<td style="text-align:center;">10GB</td>
|
||||
<td style="text-align:center;">4GB</td>
|
||||
<td style="text-align:center;">8GB</td>
|
||||
<td style="text-align:center;">3GB</td>
|
||||
<td style="text-align:center;">8GB</td>
|
||||
<td style="text-align:center;">2GB</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>RAM</th>
|
||||
<td colspan="3" style="text-align:center;">Min 16GB+, Recommended 32GB+</td>
|
||||
<td colspan="2" style="text-align:center;">8GB</td>
|
||||
<td colspan="2" style="text-align:center;">16GB</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Disk Space</th>
|
||||
|
||||
@@ -50,7 +50,7 @@
|
||||
</tr>
|
||||
<tr>
|
||||
<th>精度指标<sup>1</sup></th>
|
||||
<td style="text-align:center;">82+</td>
|
||||
<td style="text-align:center;">86+</td>
|
||||
<td colspan="4" style="text-align:center;">90+</td>
|
||||
</tr>
|
||||
<tr>
|
||||
@@ -70,15 +70,15 @@
|
||||
</tr>
|
||||
<tr>
|
||||
<th>显存最低要求</th>
|
||||
<td style="text-align:center;">6GB</td>
|
||||
<td style="text-align:center;">10GB</td>
|
||||
<td style="text-align:center;">4GB</td>
|
||||
<td style="text-align:center;">8GB</td>
|
||||
<td style="text-align:center;">3GB</td>
|
||||
<td style="text-align:center;">8GB</td>
|
||||
<td style="text-align:center;">2GB</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>内存要求</th>
|
||||
<td colspan="3" style="text-align:center;">最低16GB以上,推荐32GB以上</td>
|
||||
<td colspan="2" style="text-align:center;">8GB</td>
|
||||
<td colspan="2" style="text-align:center;">16GB</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>磁盘空间要求</th>
|
||||
|
||||
@@ -131,23 +131,27 @@ def blocks_to_page_info(
|
||||
return page_info
|
||||
|
||||
|
||||
def _iter_block_spans(block):
|
||||
for line in block.get("lines", []):
|
||||
for span in line.get("spans", []):
|
||||
yield span
|
||||
|
||||
for sub_block in block.get("blocks", []):
|
||||
yield from _iter_block_spans(sub_block)
|
||||
|
||||
|
||||
def _apply_post_ocr(pdf_info_list, hybrid_pipeline_model):
|
||||
need_ocr_list = []
|
||||
img_crop_list = []
|
||||
text_block_list = []
|
||||
for page_info in pdf_info_list:
|
||||
for block in page_info['para_blocks']:
|
||||
if block['type'] in ['table', 'image', 'list', 'code']:
|
||||
for sub_block in block['blocks']:
|
||||
if not sub_block['type'].endswith('body'):
|
||||
text_block_list.append(sub_block)
|
||||
elif block['type'] in ['text', 'title', 'ref_text']:
|
||||
text_block_list.append(block)
|
||||
for block in page_info['discarded_blocks']:
|
||||
text_block_list.append(block)
|
||||
for block in text_block_list:
|
||||
for line in block['lines']:
|
||||
for span in line['spans']:
|
||||
for block in page_info.get('para_blocks', []):
|
||||
for span in _iter_block_spans(block):
|
||||
if 'np_img' in span:
|
||||
need_ocr_list.append(span)
|
||||
img_crop_list.append(rotate_vertical_crop_if_needed(span['np_img']))
|
||||
span.pop('np_img')
|
||||
for block in page_info.get('discarded_blocks', []):
|
||||
for span in _iter_block_spans(block):
|
||||
if 'np_img' in span:
|
||||
need_ocr_list.append(span)
|
||||
img_crop_list.append(rotate_vertical_crop_if_needed(span['np_img']))
|
||||
|
||||
@@ -203,6 +203,15 @@ def _extract_text_from_block(block):
|
||||
return "".join(text_parts).strip()
|
||||
|
||||
|
||||
def _iter_block_spans(block):
|
||||
for line in block.get("lines", []):
|
||||
for span in line.get("spans", []):
|
||||
yield span
|
||||
|
||||
for sub_block in block.get("blocks", []):
|
||||
yield from _iter_block_spans(sub_block)
|
||||
|
||||
|
||||
def _normalize_formula_tag_content(tag_content):
|
||||
tag_content = full_to_half(tag_content.strip())
|
||||
if tag_content.startswith("("):
|
||||
@@ -261,22 +270,18 @@ def _optimize_formula_number_blocks(pdf_info_list):
|
||||
def _apply_post_ocr(pdf_info_list, lang=None):
|
||||
need_ocr_list = []
|
||||
img_crop_list = []
|
||||
text_block_list = []
|
||||
|
||||
for page_info in pdf_info_list:
|
||||
for block in page_info['preproc_blocks']:
|
||||
if 'blocks' in block:
|
||||
for sub_block in block['blocks']:
|
||||
if sub_block.get("type", "").endswith('caption') or sub_block.get("type", "").endswith('footnote'):
|
||||
text_block_list.append(sub_block)
|
||||
elif block["type"] not in [BlockType.INTERLINE_EQUATION, BlockType.SEAL]:
|
||||
text_block_list.append(block)
|
||||
for block in page_info['discarded_blocks']:
|
||||
text_block_list.append(block)
|
||||
for block in page_info.get('preproc_blocks', []):
|
||||
for span in _iter_block_spans(block):
|
||||
if 'np_img' in span:
|
||||
need_ocr_list.append(span)
|
||||
# Keep post-OCR rec aligned with the main OCR pipeline for vertical tall crops.
|
||||
img_crop_list.append(rotate_vertical_crop_if_needed(span['np_img']))
|
||||
span.pop('np_img')
|
||||
|
||||
for block in text_block_list:
|
||||
for line in block['lines']:
|
||||
for span in line['spans']:
|
||||
for block in page_info.get('discarded_blocks', []):
|
||||
for span in _iter_block_spans(block):
|
||||
if 'np_img' in span:
|
||||
need_ocr_list.append(span)
|
||||
# Keep post-OCR rec aligned with the main OCR pipeline for vertical tall crops.
|
||||
|
||||
@@ -1 +1 @@
|
||||
__version__ = "2.7.6"
|
||||
__version__ = "3.0.1"
|
||||
|
||||
Reference in New Issue
Block a user