diff --git a/magic_pdf/post_proc/para_split_v3.py b/magic_pdf/post_proc/para_split_v3.py index f62143ef..5f6852a6 100644 --- a/magic_pdf/post_proc/para_split_v3.py +++ b/magic_pdf/post_proc/para_split_v3.py @@ -108,6 +108,22 @@ def __is_list_or_index_block(block): ): multiple_para_flag = True + block_text = '' + + for line in block['lines']: + line_text = '' + + for span in line['spans']: + span_type = span['type'] + if span_type == ContentType.Text: + line_text += span['content'].strip() + # 添加所有文本,包括空行,保持与block['lines']长度一致 + lines_text_list.append(line_text) + block_text = ''.join(lines_text_list) + + block_lang = detect_lang(block_text) + # logger.info(f"block_lang: {block_lang}") + for line in block['lines']: line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2 block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2 @@ -119,19 +135,6 @@ def __is_list_or_index_block(block): if abs(line_mid_x - block_mid_x) < line_height / 2: center_close_num += 1 - line_text = '' - - for span in line['spans']: - span_type = span['type'] - if span_type == ContentType.Text: - line_text += span['content'].strip() - - # 添加所有文本,包括空行,保持与block['lines']长度一致 - lines_text_list.append(line_text) - block_text = ''.join(lines_text_list) - block_lang = detect_lang(block_text) - # logger.info(f"block_lang: {block_lang}") - # 计算line左侧顶格数量是否大于2,是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断 if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2: left_close_num += 1 diff --git a/requirements.txt b/requirements.txt index 060bab2d..d51cb311 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ boto3>=1.28.43 Brotli>=1.1.0 click>=8.1.7 -fast-langdetect>=0.2.3 +fast-langdetect>=0.2.3,<0.3.0 loguru>=0.6.0 numpy>=1.21.6,<2.0.0 pydantic>=2.7.2