Compare commits

...

10 Commits

Author SHA1 Message Date
Xiaomeng Zhao
264c594b23 Merge pull request #4677 from opendatalab/master
master->dev
2026-03-29 13:33:19 +08:00
myhloli
a6b6d3081c Update version.py with new version 2026-03-29 05:28:26 +00:00
Xiaomeng Zhao
1f82300d1d Merge pull request #4676 from opendatalab/dev
Dev
2026-03-29 13:24:20 +08:00
Xiaomeng Zhao
2d4fa2cc8e Merge pull request #4675 from myhloli/dev
feat: refactor OCR processing to improve span handling and reduce cod…
2026-03-29 13:23:04 +08:00
myhloli
71b9e9f780 feat: refactor OCR processing to improve span handling and reduce code duplication 2026-03-29 13:21:13 +08:00
Xiaomeng Zhao
5e51ab2934 Merge pull request #4674 from opendatalab/master
master->dev
2026-03-29 04:18:37 +08:00
Xiaomeng Zhao
520c61faaf Merge pull request #4673 from myhloli/dev
Dev
2026-03-29 04:17:57 +08:00
myhloli
72601b314a feat: update minimum hardware requirements in index.md for clarity and accuracy 2026-03-29 04:14:24 +08:00
myhloli
e54c67dcd1 feat: update minimum hardware requirements in index.md for clarity and accuracy 2026-03-29 04:13:48 +08:00
myhloli
33e4fbd694 Update version.py with new version 2026-03-28 20:11:08 +00:00
5 changed files with 46 additions and 37 deletions

View File

@@ -50,7 +50,7 @@ A WebUI developed based on Gradio, with a simple interface and only core parsing
</tr>
<tr>
<th>Accuracy<sup>1</sup></th>
<td style="text-align:center;">82+</td>
<td style="text-align:center;">86+</td>
<td colspan="4" style="text-align:center;">90+</td>
</tr>
<tr>
@@ -70,15 +70,15 @@ A WebUI developed based on Gradio, with a simple interface and only core parsing
</tr>
<tr>
<th>Min VRAM</th>
<td style="text-align:center;">6GB</td>
<td style="text-align:center;">10GB</td>
<td style="text-align:center;">4GB</td>
<td style="text-align:center;">8GB</td>
<td style="text-align:center;">3GB</td>
<td style="text-align:center;">8GB</td>
<td style="text-align:center;">2GB</td>
</tr>
<tr>
<th>RAM</th>
<td colspan="3" style="text-align:center;">Min 16GB+, Recommended 32GB+</td>
<td colspan="2" style="text-align:center;">8GB</td>
<td colspan="2" style="text-align:center;">16GB</td>
</tr>
<tr>
<th>Disk Space</th>

View File

@@ -50,7 +50,7 @@
</tr>
<tr>
<th>精度指标<sup>1</sup></th>
<td style="text-align:center;">82+</td>
<td style="text-align:center;">86+</td>
<td colspan="4" style="text-align:center;">90+</td>
</tr>
<tr>
@@ -70,15 +70,15 @@
</tr>
<tr>
<th>显存最低要求</th>
<td style="text-align:center;">6GB</td>
<td style="text-align:center;">10GB</td>
<td style="text-align:center;">4GB</td>
<td style="text-align:center;">8GB</td>
<td style="text-align:center;">3GB</td>
<td style="text-align:center;">8GB</td>
<td style="text-align:center;">2GB</td>
</tr>
<tr>
<th>内存要求</th>
<td colspan="3" style="text-align:center;">最低16GB以上,推荐32GB以上</td>
<td colspan="2" style="text-align:center;">8GB</td>
<td colspan="2" style="text-align:center;">16GB</td>
</tr>
<tr>
<th>磁盘空间要求</th>

View File

@@ -131,23 +131,27 @@ def blocks_to_page_info(
return page_info
def _iter_block_spans(block):
for line in block.get("lines", []):
for span in line.get("spans", []):
yield span
for sub_block in block.get("blocks", []):
yield from _iter_block_spans(sub_block)
def _apply_post_ocr(pdf_info_list, hybrid_pipeline_model):
need_ocr_list = []
img_crop_list = []
text_block_list = []
for page_info in pdf_info_list:
for block in page_info['para_blocks']:
if block['type'] in ['table', 'image', 'list', 'code']:
for sub_block in block['blocks']:
if not sub_block['type'].endswith('body'):
text_block_list.append(sub_block)
elif block['type'] in ['text', 'title', 'ref_text']:
text_block_list.append(block)
for block in page_info['discarded_blocks']:
text_block_list.append(block)
for block in text_block_list:
for line in block['lines']:
for span in line['spans']:
for block in page_info.get('para_blocks', []):
for span in _iter_block_spans(block):
if 'np_img' in span:
need_ocr_list.append(span)
img_crop_list.append(rotate_vertical_crop_if_needed(span['np_img']))
span.pop('np_img')
for block in page_info.get('discarded_blocks', []):
for span in _iter_block_spans(block):
if 'np_img' in span:
need_ocr_list.append(span)
img_crop_list.append(rotate_vertical_crop_if_needed(span['np_img']))

View File

@@ -203,6 +203,15 @@ def _extract_text_from_block(block):
return "".join(text_parts).strip()
def _iter_block_spans(block):
for line in block.get("lines", []):
for span in line.get("spans", []):
yield span
for sub_block in block.get("blocks", []):
yield from _iter_block_spans(sub_block)
def _normalize_formula_tag_content(tag_content):
tag_content = full_to_half(tag_content.strip())
if tag_content.startswith("("):
@@ -261,22 +270,18 @@ def _optimize_formula_number_blocks(pdf_info_list):
def _apply_post_ocr(pdf_info_list, lang=None):
need_ocr_list = []
img_crop_list = []
text_block_list = []
for page_info in pdf_info_list:
for block in page_info['preproc_blocks']:
if 'blocks' in block:
for sub_block in block['blocks']:
if sub_block.get("type", "").endswith('caption') or sub_block.get("type", "").endswith('footnote'):
text_block_list.append(sub_block)
elif block["type"] not in [BlockType.INTERLINE_EQUATION, BlockType.SEAL]:
text_block_list.append(block)
for block in page_info['discarded_blocks']:
text_block_list.append(block)
for block in page_info.get('preproc_blocks', []):
for span in _iter_block_spans(block):
if 'np_img' in span:
need_ocr_list.append(span)
# Keep post-OCR rec aligned with the main OCR pipeline for vertical tall crops.
img_crop_list.append(rotate_vertical_crop_if_needed(span['np_img']))
span.pop('np_img')
for block in text_block_list:
for line in block['lines']:
for span in line['spans']:
for block in page_info.get('discarded_blocks', []):
for span in _iter_block_spans(block):
if 'np_img' in span:
need_ocr_list.append(span)
# Keep post-OCR rec aligned with the main OCR pipeline for vertical tall crops.

View File

@@ -1 +1 @@
__version__ = "2.7.6"
__version__ = "3.0.1"