Compare commits

...

34 Commits

Author SHA1 Message Date
Xiaomeng Zhao
b03a7fae5e Merge pull request #1153 from opendatalab/release-0.10.4
Release 0.10.4
2024-11-30 02:47:28 +08:00
Xiaomeng Zhao
9726403c69 Merge pull request #1152 from myhloli/dev
fix(mkcontent): optimize paragraph text merging and language detection
2024-11-30 02:45:02 +08:00
myhloli
b3127233f0 refactor: modify bbox processing for layout separation
- Remove overlap between bboxes for block separation
- Sort bboxes by combined x and y coordinates for better layout handling
- Comment out previous overlap removal function
2024-11-30 02:33:26 +08:00
myhloli
b80befe9cf refactor(mkcontent): optimize paragraph text merging and language detection
- Extract language detection to block level instead of line level
- Improve logic for handling Chinese, Japanese, and Korean languages
- Refactor code for better readability and performance
- Optimize handling of hyphenated words at line ends
2024-11-30 02:16:38 +08:00
myhloli
ea35fa6b60 Merge remote-tracking branch 'origin/dev' into dev 2024-11-30 01:14:26 +08:00
myhloli
c8cabb3cf6 feat(ocr_mkcontent): add language detection for line spacing
- Introduce language detection to determine line spacing based on language context
- Implement different spacing rules for Chinese/Japanese/Korean and Western texts
- Adjust span content handling based on detected language and span type
2024-11-30 01:14:12 +08:00
Xiaomeng Zhao
78c9014073 Merge pull request #1147 from opendatalab/master
master->dev
2024-11-29 16:44:47 +08:00
myhloli
d19911f113 Update version.py with new version 2024-11-29 08:03:01 +00:00
Xiaomeng Zhao
b3fbedf055 Merge pull request #1143 from opendatalab/release-0.10.3
Release 0.10.3
2024-11-29 16:01:36 +08:00
Xiaomeng Zhao
66bd0f8b69 Merge pull request #1141 from myhloli/dev
refactor(ocr): Fix the error of paddleocr failing to initialize in a multi-threaded environment
2024-11-29 12:03:48 +08:00
myhloli
7f2f2c0f28 refactor(ocr): Fix the error of paddleocr failing to initialize in a multi-threaded environment 2024-11-29 12:02:48 +08:00
Xiaomeng Zhao
68c455309d Merge pull request #1140 from myhloli/dev
refactor(pdf_parse): adjust character-axis alignment algorithm
2024-11-29 12:00:36 +08:00
myhloli
d4345b6e39 refactor(pdf_parse): adjust character-axis alignment algorithm
- Introduce `span_height_radio` parameter to calculate_char_in_span function
- Replace fixed ratio with dynamic ratio for character and span axis alignment
- Improve flexibility and accuracy of character placement within spans
2024-11-29 11:59:52 +08:00
Xiaomeng Zhao
086b48b7ae Merge pull request #1139 from myhloli/dev
fix(ocr_mkcontent): handle empty paragraphs on pages
2024-11-29 11:59:03 +08:00
myhloli
782e6571bc fix(ocr_mkcontent): handle empty paragraphs on pages
- Add empty paragraph handling for pages with no content
- Append an empty markdown object when a page has no paragraphs
- Increment page number even if no content is present
2024-11-29 11:58:34 +08:00
Xiaomeng Zhao
4adabc37ac Merge pull request #1138 from myhloli/dev
feat(pdf_parse): add line start flag detection and optimize line stop flag logic
2024-11-28 23:13:47 +08:00
myhloli
949d0867fb feat(pdf_parse): add line start flag detection and optimize line stop flag logic
- Add LINE_START_FLAG tuple to identify starting flags of a line
- Modify calculate_char_in_span function to handle both line start and stop flags
- Remove redundant char_is_line_stop_flag variable and simplify logic
- Improve line flag detection to enhance text extraction accuracy
2024-11-28 23:12:37 +08:00
Xiaomeng Zhao
a1cff28c74 Merge pull request #1137 from myhloli/dev
refactor(pdf_check): improve character detection using PyMuPDF
2024-11-28 22:36:30 +08:00
myhloli
ac88815620 refactor(pdf_check): improve character detection using PyMuPDF
- Replace pdfminer with PyMuPDF for character detection
- Implement new method detect_invalid_chars_by_pymupdf
- Update check_invalid_chars in pdf_meta_scan.py to use new method
- Add __replace_0xfffd function in pdf_parse_union_core_v2.py to handle special characters
- Remove unused imports and update requirements.txt
2024-11-28 22:34:23 +08:00
Xiaomeng Zhao
b4dfa0f92f Merge pull request #1136 from myhloli/dev
refactor(ocr): improve text processing and span handling
2024-11-28 19:39:28 +08:00
myhloli
88c0854a65 refactor(ocr): improve text processing and span handling
- Remove unused language detection code
- Simplify text content processing logic
- Update span sorting and text extraction in pdf_parse_union_core_v2.py
2024-11-28 19:38:30 +08:00
Xiaomeng Zhao
c295587b9e Merge pull request #1135 from myhloli/dev
feat(pdf_parse): filter out skewed text lines
2024-11-28 18:53:06 +08:00
myhloli
37da8c44c4 feat(pdf_parse): filter out skewed text lines
- Add direction filtering to ignore highly skewed text lines
- Improve text extraction accuracy by focusing on non-skewed content
2024-11-28 18:52:18 +08:00
Xiaomeng Zhao
5ecafbfa7d Merge pull request #1134 from myhloli/dev
refactor(para): improve language detection and block splitting
2024-11-28 18:07:23 +08:00
myhloli
f674b8d413 refactor(para): improve language detection and block splitting
- Add language detection for each block of text
- Implement language-specific logic for right margin alignment
- Introduce logging for debugging purposes
2024-11-28 18:06:17 +08:00
Xiaomeng Zhao
e22fa18b46 Merge pull request #1132 from myhloli/dev
fix(Hybrid OCR):Enable Hybrid OCR for Empty Spans That Contain a Certain Number of Placeholders but No Actual Text
2024-11-28 15:34:00 +08:00
myhloli
08392d63a0 fix(Hybrid OCR):Enable Hybrid OCR for Empty Spans That Contain a Certain Number of Placeholders but No Actual Text 2024-11-28 15:29:42 +08:00
Xiaomeng Zhao
f09c1cd284 Merge pull request #1130 from myhloli/dev
fix(lite_model): Adapt Lite Mode to the Hybrid OCR Mode in Version 0.10
2024-11-28 15:27:52 +08:00
myhloli
9b4d77dcd4 fix(lite_model): Adapt Lite Mode to the Hybrid OCR Mode in Version 0.10 2024-11-28 15:06:54 +08:00
Xiaomeng Zhao
89c7bd0419 Merge pull request #1121 from opendatalab/master
master -> dev
2024-11-27 18:33:05 +08:00
myhloli
52ef1bc782 Update version.py with new version 2024-11-27 10:31:09 +00:00
Xiaomeng Zhao
8afff9aee8 Merge pull request #1120 from opendatalab/release-0.10.2
Release 0.10.2
2024-11-27 18:16:02 +08:00
yyy
4df1eb74fa Update daily.yml 2024-11-26 12:49:29 +08:00
Xiaomeng Zhao
fcfaede87b Update bug_report.yml 2024-11-25 14:39:59 +08:00
14 changed files with 224 additions and 151 deletions

View File

@@ -81,6 +81,7 @@ body:
- "0.7.x"
- "0.8.x"
- "0.9.x"
- "0.10.x"
validations:
required: true

View File

@@ -1,53 +1 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
name: mineru
jobs:
cli-test:
runs-on: pdf
timeout-minutes: 240
strategy:
fail-fast: true
steps:
- name: PDF cli
uses: actions/checkout@v3
with:
fetch-depth: 2
- name: install&test
run: |
source activate mineru
conda env list
pip show coverage
git checkout "dev"
# cd $GITHUB_WORKSPACE && sh tests/retry_env.sh
cd $GITHUB_WORKSPACE && python tests/clean_coverage.py
cd $GITHUB_WORKSPACE && coverage run -m pytest tests/unittest/ --cov=magic_pdf/ --cov-report html --cov-report term-missing
cd $GITHUB_WORKSPACE && python tests/get_coverage.py
cd $GITHUB_WORKSPACE && pytest -s -v tests/test_cli/test_cli_sdk.py
notify_to_feishu:
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }}
needs: cli-test
runs-on: pdf
steps:
- name: get_actor
run: |
metion_list="dt-yy"
echo $GITHUB_ACTOR
if [[ $GITHUB_ACTOR == "drunkpig" ]]; then
metion_list="xuchao"
elif [[ $GITHUB_ACTOR == "myhloli" ]]; then
metion_list="zhaoxiaomeng"
elif [[ $GITHUB_ACTOR == "icecraft" ]]; then
metion_list="xurui1"
fi
echo $metion_list
echo "METIONS=$metion_list" >> "$GITHUB_ENV"
echo ${{ env.METIONS }}
- name: notify
run: |
echo ${{ secrets.USER_ID }}
curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"'${{ github.repository }}' GitHubAction Failed","content":[[{"tag":"text","text":""},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}' ${{ secrets.WEBHOOK_URL }}

View File

@@ -30,6 +30,13 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
for page_info in pdf_info_dict:
paras_of_layout = page_info.get('para_blocks')
if not paras_of_layout:
markdown_with_para_and_pagination.append({
'page_no':
page_no,
'md_content':
'',
})
page_no += 1
continue
page_markdown = ocr_mk_markdown_with_para_core_v2(
paras_of_layout, 'mm', img_buket_path)
@@ -129,21 +136,19 @@ def __replace_ligatures(text: str):
def merge_para_with_text(para_block):
block_text = ''
for line in para_block['lines']:
for span in line['spans']:
if span['type'] in [ContentType.Text]:
block_text += span['content']
block_lang = detect_lang(block_text)
para_text = ''
for i, line in enumerate(para_block['lines']):
if i >= 1 and line.get(ListLineTag.IS_LIST_START_LINE, False):
para_text += ' \n'
line_text = ''
line_lang = ''
for span in line['spans']:
span_type = span['type']
if span_type == ContentType.Text:
line_text += span['content'].strip()
if line_text != '':
line_lang = detect_lang(line_text)
for j, span in enumerate(line['spans']):
span_type = span['type']
@@ -156,20 +161,20 @@ def merge_para_with_text(para_block):
content = f"\n$$\n{span['content']}\n$$\n"
content = content.strip()
if content != '':
if content:
langs = ['zh', 'ja', 'ko']
if line_lang in langs: # 遇到一些一个字一个span的文档这种单字语言判断不准需要用整行文本判断
if span_type in [ContentType.Text, ContentType.InterlineEquation]:
para_text += content # 中文/日语/韩文语境下content间不需要空格分隔
elif span_type == ContentType.InlineEquation:
para_text += f' {content} '
# logger.info(f'block_lang: {block_lang}, content: {content}')
if block_lang in langs: # 中文/日语/韩文语境下,换行不需要空格分隔
if j == len(line['spans']) - 1:
para_text += content
else:
para_text += f'{content} '
else:
if span_type in [ContentType.Text, ContentType.InlineEquation]:
# 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
if j == len(line['spans'])-1 and span_type == ContentType.Text and __is_hyphen_at_line_end(content):
para_text += content[:-1]
elif len(content) == 1 and content not in ['A', 'I', 'a', 'i'] and not content.isdigit():
para_text += content
else: # 西方文本语境下 content间需要空格分隔
para_text += f'{content} '
elif span_type == ContentType.InterlineEquation:
@@ -177,7 +182,7 @@ def merge_para_with_text(para_block):
else:
continue
# 连写字符拆分
para_text = __replace_ligatures(para_text)
# para_text = __replace_ligatures(para_text)
return para_text

View File

@@ -8,7 +8,7 @@ from loguru import logger
from magic_pdf.config.drop_reason import DropReason
from magic_pdf.libs.commons import get_top_percent_list, mymax
from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.pdf_check import detect_invalid_chars
from magic_pdf.libs.pdf_check import detect_invalid_chars_by_pymupdf
scan_max_page = 50
junk_limit_min = 10
@@ -323,7 +323,7 @@ def get_language(doc: fitz.Document):
def check_invalid_chars(pdf_bytes):
"""乱码检测."""
return detect_invalid_chars(pdf_bytes)
return detect_invalid_chars_by_pymupdf(pdf_bytes)
def pdf_meta_scan(pdf_bytes: bytes):

View File

@@ -1,9 +1,9 @@
from io import BytesIO
import re
import fitz
import numpy as np
from loguru import logger
from pdfminer.high_level import extract_text
# import re
# from io import BytesIO
# from pdfminer.high_level import extract_text
def calculate_sample_count(total_page: int):
@@ -14,7 +14,7 @@ def calculate_sample_count(total_page: int):
return select_page_cnt
def extract_pages(src_pdf_bytes: bytes):
def extract_pages(src_pdf_bytes: bytes) -> fitz.Document:
pdf_docs = fitz.open("pdf", src_pdf_bytes)
total_page = len(pdf_docs)
if total_page == 0:
@@ -33,30 +33,57 @@ def extract_pages(src_pdf_bytes: bytes):
return sample_docs
def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
""""
检测PDF中是否包含非法字符
# def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
# """"
# 检测PDF中是否包含非法字符
# """
# '''pdfminer比较慢,需要先随机抽取10页左右的sample'''
# sample_docs = extract_pages(src_pdf_bytes)
# sample_pdf_bytes = sample_docs.tobytes()
# sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
# text = extract_text(sample_pdf_file_like_object)
# text = text.replace("\n", "")
# # logger.info(text)
# '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
# cid_pattern = re.compile(r'\(cid:\d+\)')
# matches = cid_pattern.findall(text)
# cid_count = len(matches)
# cid_len = sum(len(match) for match in matches)
# text_len = len(text)
# if text_len == 0:
# cid_chars_radio = 0
# else:
# cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
# logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
# '''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
# if cid_chars_radio > 0.05:
# return False # 乱码文档
# else:
# return True # 正常文档
def count_replacement_characters(text: str) -> int:
"""
'''pdfminer比较慢,需要先随机抽取10页左右的sample'''
统计字符串中 0xfffd 字符的数量。
"""
return text.count('\ufffd')
def detect_invalid_chars_by_pymupdf(src_pdf_bytes: bytes) -> bool:
sample_docs = extract_pages(src_pdf_bytes)
sample_pdf_bytes = sample_docs.tobytes()
sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
text = extract_text(sample_pdf_file_like_object)
text = text.replace("\n", "")
# logger.info(text)
'''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
cid_pattern = re.compile(r'\(cid:\d+\)')
matches = cid_pattern.findall(text)
cid_count = len(matches)
cid_len = sum(len(match) for match in matches)
text_len = len(text)
doc_text = ""
for page in sample_docs:
page_text = page.get_text('text', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)
doc_text += page_text
text_len = len(doc_text)
uffd_count = count_replacement_characters(doc_text)
if text_len == 0:
cid_chars_radio = 0
uffd_chars_radio = 0
else:
cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
'''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
if cid_chars_radio > 0.05:
uffd_chars_radio = uffd_count / text_len
logger.info(f"uffd_count: {uffd_count}, text_len: {text_len}, uffd_chars_radio: {uffd_chars_radio}")
'''当一篇文章存在1%以上的文本是乱码时,认为该文档为乱码文档'''
if uffd_chars_radio > 0.01:
return False # 乱码文档
else:
return True # 正常文档
return True # 正常文档

View File

@@ -1 +1 @@
__version__ = "0.10.1"
__version__ = "0.10.3"

View File

@@ -18,11 +18,31 @@ def region_to_bbox(region):
class CustomPaddleModel:
def __init__(self, ocr: bool = False, show_log: bool = False, lang=None):
def __init__(self,
ocr: bool = False,
show_log: bool = False,
lang=None,
det_db_box_thresh=0.3,
use_dilation=True,
det_db_unclip_ratio=1.8
):
if lang is not None:
self.model = PPStructure(table=False, ocr=ocr, show_log=show_log, lang=lang)
self.model = PPStructure(table=False,
ocr=True,
show_log=show_log,
lang=lang,
det_db_box_thresh=det_db_box_thresh,
use_dilation=use_dilation,
det_db_unclip_ratio=det_db_unclip_ratio,
)
else:
self.model = PPStructure(table=False, ocr=ocr, show_log=show_log)
self.model = PPStructure(table=False,
ocr=True,
show_log=show_log,
det_db_box_thresh=det_db_box_thresh,
use_dilation=use_dilation,
det_db_unclip_ratio=det_db_unclip_ratio,
)
def __call__(self, img):
try:

View File

@@ -1,9 +1,55 @@
import cv2
import numpy as np
from loguru import logger
from io import BytesIO
from PIL import Image
import base64
from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold
from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line
from ppocr.utils.utility import check_and_read
def img_decode(content: bytes):
np_arr = np.frombuffer(content, dtype=np.uint8)
return cv2.imdecode(np_arr, cv2.IMREAD_UNCHANGED)
def check_img(img):
if isinstance(img, bytes):
img = img_decode(img)
if isinstance(img, str):
image_file = img
img, flag_gif, flag_pdf = check_and_read(image_file)
if not flag_gif and not flag_pdf:
with open(image_file, 'rb') as f:
img_str = f.read()
img = img_decode(img_str)
if img is None:
try:
buf = BytesIO()
image = BytesIO(img_str)
im = Image.open(image)
rgb = im.convert('RGB')
rgb.save(buf, 'jpeg')
buf.seek(0)
image_bytes = buf.read()
data_base64 = str(base64.b64encode(image_bytes),
encoding="utf-8")
image_decode = base64.b64decode(data_base64)
img_array = np.frombuffer(image_decode, np.uint8)
img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
except:
logger.error("error in loading image:{}".format(image_file))
return None
if img is None:
logger.error("error in loading image:{}".format(image_file))
return None
if isinstance(img, np.ndarray) and len(img.shape) == 2:
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
return img
def bbox_to_points(bbox):
""" 将bbox格式转换为四个顶点的数组 """

View File

@@ -1,15 +1,17 @@
import copy
import time
import cv2
import numpy as np
from paddleocr import PaddleOCR
from paddleocr.paddleocr import check_img, logger
from paddleocr.ppocr.utils.utility import alpha_to_color, binarize_img
from paddleocr.tools.infer.predict_system import sorted_boxes
from paddleocr.tools.infer.utility import get_rotate_crop_image, get_minarea_rect_crop
from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import update_det_boxes, merge_det_boxes
from paddleocr import PaddleOCR
from ppocr.utils.logging import get_logger
from ppocr.utils.utility import alpha_to_color, binarize_img
from tools.infer.predict_system import sorted_boxes
from tools.infer.utility import get_rotate_crop_image, get_minarea_rect_crop
from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import update_det_boxes, merge_det_boxes, check_img
logger = get_logger()
class ModifiedPaddleOCR(PaddleOCR):

View File

@@ -2,8 +2,8 @@ import os
import cv2
import numpy as np
from paddleocr.ppstructure.table.predict_table import TableSystem
from paddleocr.ppstructure.utility import init_args
from ppstructure.table.predict_table import TableSystem
from ppstructure.utility import init_args
from PIL import Image
from magic_pdf.config.constants import * # noqa: F403

View File

@@ -1,7 +1,10 @@
import copy
from loguru import logger
from magic_pdf.config.constants import CROSS_PAGE, LINES_DELETED
from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.libs.language import detect_lang
LINE_STOP_FLAG = (
'.',
@@ -125,6 +128,9 @@ def __is_list_or_index_block(block):
# 添加所有文本包括空行保持与block['lines']长度一致
lines_text_list.append(line_text)
block_text = ''.join(lines_text_list)
block_lang = detect_lang(block_text)
# logger.info(f"block_lang: {block_lang}")
# 计算line左侧顶格数量是否大于2是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断
if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2:
@@ -136,13 +142,16 @@ def __is_list_or_index_block(block):
if abs(block['bbox_fs'][2] - line['bbox'][2]) < line_height:
right_close_num += 1
else:
# 右侧不顶格情况下是否有一段距离拍脑袋用0.3block宽度做阈值
# block宽的阈值可以小些block窄的阈值要大
if block_weight_radio >= 0.5:
# 类中文没有超长单词的情况,可以用统一的阈值
if block_lang in ['zh', 'ja', 'ko']:
closed_area = 0.26 * block_weight
else:
closed_area = 0.36 * block_weight
# 右侧不顶格情况下是否有一段距离拍脑袋用0.3block宽度做阈值
# block宽的阈值可以小些block窄的阈值要大
if block_weight_radio >= 0.5:
closed_area = 0.26 * block_weight
else:
closed_area = 0.36 * block_weight
if block['bbox_fs'][2] - line['bbox'][2] > closed_area:
right_not_close_num += 1

View File

@@ -30,22 +30,14 @@ try:
torchtext.disable_torchtext_deprecation_warning()
except ImportError:
pass
from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
from magic_pdf.para.para_split_v3 import para_split
from magic_pdf.pre_proc.construct_page_dict import \
ocr_construct_page_component_v2
from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2
from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
ocr_prepare_bboxes_for_layout_split_v2
from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
fix_block_spans_v2,
fix_discarded_block)
from magic_pdf.pre_proc.ocr_span_list_modify import (
get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
remove_overlaps_min_spans)
from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split_v2
from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block
from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, remove_overlaps_min_spans
def __replace_STX_ETX(text_str: str):
@@ -65,10 +57,18 @@ def __replace_STX_ETX(text_str: str):
return text_str
def __replace_0xfffd(text_str: str):
"""Replace \ufffd, as these characters become garbled when extracted using pymupdf."""
if text_str:
s = text_str.replace('\ufffd', " ")
return s
return text_str
def chars_to_content(span):
# 检查span中的char是否为空
if len(span['chars']) == 0:
span['content'] = ''
pass
# span['content'] = ''
else:
# 先给chars按char['bbox']的中心点的x坐标排序
span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
@@ -83,22 +83,24 @@ def chars_to_content(span):
if char['bbox'][0] - span['chars'][span['chars'].index(char) - 1]['bbox'][2] > char_avg_width:
content += ' '
content += char['c']
span['content'] = __replace_STX_ETX(content)
span['content'] = __replace_0xfffd(content)
del span['chars']
LINE_STOP_FLAG = ('.', '!', '?', '', '', '', ')', '', '"', '', ':', '', ';', '', ']', '', '}', '}', '>', '', '', ',', '', '-', '', '',)
LINE_START_FLAG = ('(', '', '"', '', '', '{', '', '<', '', '', '', '[',)
def fill_char_in_spans(spans, all_chars):
# 简单从上到下排一下序
spans = sorted(spans, key=lambda x: x['bbox'][1])
for char in all_chars:
for span in spans:
# 判断char是否属于LINE_STOP_FLAG
if char['c'] in LINE_STOP_FLAG:
char_is_line_stop_flag = True
else:
char_is_line_stop_flag = False
if calculate_char_in_span(char['bbox'], span['bbox'], char_is_line_stop_flag):
if calculate_char_in_span(char['bbox'], span['bbox'], char['c']):
span['chars'].append(char)
break
@@ -106,13 +108,16 @@ def fill_char_in_spans(spans, all_chars):
for span in spans:
chars_to_content(span)
if len(span['content']) == 0:
# 有的span中虽然没有字但有一两个空的占位符用宽高和content长度过滤
if len(span['content']) * span['height'] < span['width'] * 0.5:
# logger.info(f"maybe empty span: {len(span['content'])}, {span['height']}, {span['width']}")
empty_spans.append(span)
del span['height'], span['width']
return empty_spans
# 使用鲁棒性更强的中心点坐标判断
def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33):
char_center_x = (char_bbox[0] + char_bbox[2]) / 2
char_center_y = (char_bbox[1] + char_bbox[3]) / 2
span_center_y = (span_bbox[1] + span_bbox[3]) / 2
@@ -121,18 +126,26 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
if (
span_bbox[0] < char_center_x < span_bbox[2]
and span_bbox[1] < char_center_y < span_bbox[3]
and abs(char_center_y - span_center_y) < span_height / 4 # 字符的中轴和span的中轴高度差不能超过1/4span高度
and abs(char_center_y - span_center_y) < span_height * span_height_radio # 字符的中轴和span的中轴高度差不能超过1/4span高度
):
return True
else:
# 如果char是LINE_STOP_FLAG就不用中心点判定换一种方案左边界在span区域内高度判定和之前逻辑一致
# 主要是给结尾符号一个进入span的机会这个char还应该离span右边界较近
if char_is_line_stop_flag:
if char in LINE_STOP_FLAG:
if (
(span_bbox[2] - span_height) < char_bbox[0] < span_bbox[2]
and char_center_x > span_bbox[0]
and span_bbox[1] < char_center_y < span_bbox[3]
and abs(char_center_y - span_center_y) < span_height / 4
and abs(char_center_y - span_center_y) < span_height * span_height_radio
):
return True
elif char in LINE_START_FLAG:
if (
span_bbox[0] < char_bbox[2] < (span_bbox[0] + span_height)
and char_center_x < span_bbox[2]
and span_bbox[1] < char_center_y < span_bbox[3]
and abs(char_center_y - span_center_y) < span_height * span_height_radio
):
return True
else:
@@ -141,12 +154,14 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
# @todo: 拿到char之后把倾斜角度较大的先删一遍
all_pymu_chars = []
for block in text_blocks_raw:
for line in block['lines']:
cosine, sine = line['dir']
if abs (cosine) < 0.9 or abs(sine) > 0.1:
continue
for span in line['spans']:
all_pymu_chars.extend(span['chars'])
@@ -157,6 +172,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
continue
span_height = span['bbox'][3] - span['bbox'][1]
span['height'] = span_height
span['width'] = span['bbox'][2] - span['bbox'][0]
span_height_list.append(span_height)
if len(span_height_list) == 0:
return spans
@@ -174,15 +190,13 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
if block[7] in [BlockType.ImageBody, BlockType.TableBody, BlockType.InterlineEquation]:
continue
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
if span['height'] > median_span_height * 3 and span['height'] > (span['bbox'][2] - span['bbox'][0]) * 3:
if span['height'] > median_span_height * 3 and span['height'] > span['width'] * 3:
vertical_spans.append(span)
elif block in all_bboxes:
useful_spans.append(span)
else:
unuseful_spans.append(span)
del span['height']
break
"""垂直的span框直接用pymu的line进行填充"""
@@ -232,6 +246,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
if ocr_res and len(ocr_res) > 0:
if len(ocr_res[0]) > 0:
ocr_text, ocr_score = ocr_res[0][0]
# logger.info(f"ocr_text: {ocr_text}, ocr_score: {ocr_score}")
if ocr_score > 0.5 and len(ocr_text) > 0:
span['content'] = ocr_text
span['score'] = ocr_score

View File

@@ -117,8 +117,8 @@ def ocr_prepare_bboxes_for_layout_split_v2(
all_bboxes = remove_overlaps_min_blocks(all_bboxes)
all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
"""将剩余的bbox做分离处理防止后面分layout时出错"""
all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
# all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
all_bboxes.sort(key=lambda x: x[0]+x[1])
return all_bboxes, all_discarded_blocks

View File

@@ -4,10 +4,10 @@ click>=8.1.7
fast-langdetect==0.2.0
loguru>=0.6.0
numpy>=1.21.6,<2.0.0
pdfminer.six==20231228
pydantic>=2.7.2,<2.8.0
PyMuPDF>=1.24.9
scikit-learn>=1.0.2
torch>=2.2.2,<=2.3.1
transformers
# pdfminer.six==20231228
# The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.