From c660fdc8f0be3f7932616b5e757ec00a7543a430 Mon Sep 17 00:00:00 2001
From: myhloli <moe@myhloli.com>
Date: Tue, 24 Dec 2024 14:33:35 +0800
Subject: [PATCH] feat(llm): add LLM-aided formula and text correction

- Add LLM-aided formula and text correction functionality
- Update config reader to include LLM-aided settings
- Create new LLM-aided processing module (prompt templates and placeholder entry points)
- Update main processing script to incorporate LLM-aided corrections
- Modify download scripts to check for new config version
---
 magic-pdf.template.json                       | 16 ++++-
 magic_pdf/dict2md/ocr_mkcontent.py            |  2 +-
 magic_pdf/libs/config_reader.py               |  9 +++
 magic_pdf/para/__init__.py                    |  0
 magic_pdf/pdf_parse_union_core_v2.py          | 17 ++++-
 magic_pdf/post_proc/llm_aided.py              | 64 +++++++++++++++++++
 .../{para => post_proc}/para_split_v3.py      |  0
 scripts/download_models.py                    |  2 +-
 scripts/download_models_hf.py                 |  2 +-
 9 files changed, 106 insertions(+), 6 deletions(-)
 delete mode 100644 magic_pdf/para/__init__.py
 create mode 100644 magic_pdf/post_proc/llm_aided.py
 rename magic_pdf/{para => post_proc}/para_split_v3.py (100%)

diff --git a/magic-pdf.template.json b/magic-pdf.template.json
index cdb3dab6..23d781aa 100644
--- a/magic-pdf.template.json
+++ b/magic-pdf.template.json
@@ -19,5 +19,19 @@
         "enable": false,
         "max_time": 400
     },
-    "config_version": "1.0.0"
+    "llm-aided-config": {
+        "formula_aided": {
+            "api_key": "your_api_key",
+            "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
+            "model": "qwen2.5-72b-instruct",
+            "enable": false
+        },
+        "text_aided": {
+            "api_key": "your_api_key",
+            "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
+            "model": "qwen2.5-7b-instruct",
+            "enable": false
+        }
+    },
+    "config_version": "1.1.0"
 }
\ No newline at end of file
diff --git a/magic_pdf/dict2md/ocr_mkcontent.py b/magic_pdf/dict2md/ocr_mkcontent.py
index 6feb35d9..9563ab08 100644
--- a/magic_pdf/dict2md/ocr_mkcontent.py
+++ b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -7,7 +7,7 @@ from magic_pdf.config.ocr_content_type import BlockType, ContentType
 from magic_pdf.libs.commons import join_path
 from magic_pdf.libs.language import detect_lang
 from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
-from magic_pdf.para.para_split_v3 import ListLineTag
+from magic_pdf.post_proc.para_split_v3 import ListLineTag
 
 
 def __is_hyphen_at_line_end(line):
diff --git a/magic_pdf/libs/config_reader.py b/magic_pdf/libs/config_reader.py
index 5ab0f5d4..1253ac68 100644
--- a/magic_pdf/libs/config_reader.py
+++ b/magic_pdf/libs/config_reader.py
@@ -116,6 +116,15 @@ def get_formula_config():
     else:
         return formula_config
 
+def get_llm_aided_config():
+    """Return the 'llm-aided-config' entry from read_config(), or None (after a warning) if the lookup yields None."""
+    llm_aided_section = read_config().get('llm-aided-config')
+    if llm_aided_section is not None:
+        return llm_aided_section
+
+    logger.warning(f"'llm-aided-config' not found in {CONFIG_FILE_NAME}, use 'None' as default")
+    return None
+
 
 if __name__ == '__main__':
     ak, sk, endpoint = get_s3_config('llm-raw')
diff --git a/magic_pdf/para/__init__.py b/magic_pdf/para/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/magic_pdf/pdf_parse_union_core_v2.py b/magic_pdf/pdf_parse_union_core_v2.py
index aa6db3c3..aa866d93 100644
--- a/magic_pdf/pdf_parse_union_core_v2.py
+++ b/magic_pdf/pdf_parse_union_core_v2.py
@@ -14,11 +14,12 @@ from magic_pdf.config.ocr_content_type import BlockType, ContentType
 from magic_pdf.data.dataset import Dataset, PageableData
 from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio
 from magic_pdf.libs.clean_memory import clean_memory
-from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
+from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir, get_llm_aided_config
 from magic_pdf.libs.convert_utils import dict_to_list
 from magic_pdf.libs.hash_utils import compute_md5
 from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
 from magic_pdf.model.magic_model import MagicModel
+from magic_pdf.post_proc.llm_aided import llm_aided_formula, llm_aided_text
 
 try:
     import torchtext
@@ -29,7 +30,7 @@ except ImportError:
     pass
 
 from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
-from magic_pdf.para.para_split_v3 import para_split
+from magic_pdf.post_proc.para_split_v3 import para_split
 from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2
 from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
 from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split_v2
@@ -828,6 +829,18 @@ def pdf_parse_union(
     """分段"""
     para_split(pdf_info_dict)
 
+    # LLM-aided post-processing: a stage runs only if its config section exists and sets 'enable' to true.
+    llm_aided_config = get_llm_aided_config()
+    if llm_aided_config is not None:
+        # Formula correction
+        formula_aided_config = llm_aided_config.get('formula_aided', None)
+        if formula_aided_config is not None and formula_aided_config.get('enable', False):
+            llm_aided_formula(pdf_info_dict, formula_aided_config)
+        # Text correction
+        text_aided_config = llm_aided_config.get('text_aided', None)
+        if text_aided_config is not None and text_aided_config.get('enable', False):
+            llm_aided_text(pdf_info_dict, text_aided_config)
+
     """dict转list"""
     pdf_info_list = dict_to_list(pdf_info_dict)
     new_pdf_info_dict = {
diff --git a/magic_pdf/post_proc/llm_aided.py b/magic_pdf/post_proc/llm_aided.py
new file mode 100644
index 00000000..3b975a7a
--- /dev/null
+++ b/magic_pdf/post_proc/llm_aided.py
@@ -0,0 +1,64 @@
+# Copyright (c) Opendatalab. All rights reserved.
+# Prompt asking an LLM to repair a recognized LaTeX formula; `$FORMULA` is presumably the substitution slot (TODO confirm).
+formula_correction_prompt = """请根据以下指南修正LaTeX公式的错误，确保公式能够渲染且符合原始内容：
+
+1. 修正渲染或编译错误：
+    - Some syntax errors such as mismatched/missing/extra tokens. Your task is to fix these syntax errors and make sure corrected results conform to latex math syntax principles.
+    - 包含KaTeX不支持的关键词等原因导致的无法编译或渲染的错误
+
+2. 保留原始信息：
+   - 保留原始公式中的所有重要信息
+   - 不要添加任何原始公式中没有的新信息
+
+IMPORTANT:请仅返回修正后的公式，不要包含任何介绍、解释或元数据。
+
+LaTeX recognition result:
+$FORMULA
+
+Your corrected result:
+"""
+# Prompt for LLM-based OCR text correction. Plain string, not an f-string: it has no placeholders to interpolate.
+text_correction_prompt = """请根据以下指南修正OCR引起的错误，确保文本连贯并符合原始内容：
+
+1. 修正OCR引起的拼写错误和错误：
+   - 修正常见的OCR错误（例如，'rn' 被误读为 'm'）
+   - 使用上下文和常识进行修正
+   - 只修正明显的错误，不要不必要的修改内容
+   - 不要添加额外的句号或其他不必要的标点符号
+
+2. 保持原始结构：
+   - 保留所有标题和子标题
+
+3. 保留原始内容：
+   - 保留原始文本中的所有重要信息
+   - 不要添加任何原始文本中没有的新信息
+   - 保留段落之间的换行符
+
+4. 保持连贯性：
+   - 确保内容与前文顺畅连接
+   - 适当处理在句子中间开始或结束的文本
+   
+5. 修正行内公式：
+   - 去除行内公式前后多余的空格
+   - 修正公式中的OCR错误
+   - 确保公式能够通过KaTeX渲染
+   
+6. 修正全角字符
+    - 修正全角标点符号为半角标点符号
+    - 修正全角字母为半角字母
+    - 修正全角数字为半角数字
+
+IMPORTANT:请仅返回修正后的文本，保留所有原始格式，包括换行符。不要包含任何介绍、解释或元数据。
+
+Previous context:
+
+Current chunk to process:
+
+Corrected text:
+"""
+
+def llm_aided_formula(pdf_info_dict, formula_aided_config):
+    """Placeholder for LLM-aided formula correction: not implemented yet, so this is a no-op returning None."""
+
+def llm_aided_text(pdf_info_dict, text_aided_config):
+    """Placeholder for LLM-aided text correction: not implemented yet, so this is a no-op returning None."""
\ No newline at end of file
diff --git a/magic_pdf/para/para_split_v3.py b/magic_pdf/post_proc/para_split_v3.py
similarity index 100%
rename from magic_pdf/para/para_split_v3.py
rename to magic_pdf/post_proc/para_split_v3.py
diff --git a/scripts/download_models.py b/scripts/download_models.py
index 23e07608..73e61136 100644
--- a/scripts/download_models.py
+++ b/scripts/download_models.py
@@ -16,7 +16,7 @@ def download_and_modify_json(url, local_filename, modifications):
     if os.path.exists(local_filename):
         data = json.load(open(local_filename))
         config_version = data.get('config_version', '0.0.0')
-        if config_version < '1.0.0':
+        if config_version < '1.1.0':
             data = download_json(url)
     else:
         data = download_json(url)
diff --git a/scripts/download_models_hf.py b/scripts/download_models_hf.py
index e2af5a09..c2b944a5 100644
--- a/scripts/download_models_hf.py
+++ b/scripts/download_models_hf.py
@@ -16,7 +16,7 @@ def download_and_modify_json(url, local_filename, modifications):
     if os.path.exists(local_filename):
         data = json.load(open(local_filename))
         config_version = data.get('config_version', '0.0.0')
-        if config_version < '1.0.0':
+        if config_version < '1.1.0':
             data = download_json(url)
     else:
         data = download_json(url)