diff --git a/README.md b/README.md
index 6c04379f..99d47e0a 100644
--- a/README.md
+++ b/README.md
@@ -121,7 +121,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
- Preserve the structure of the original document, including headings, paragraphs, lists, etc.
- Extract images, image descriptions, tables, table titles, and footnotes.
- Automatically recognize and convert formulas in the document to LaTeX format.
-- Automatically recognize and convert tables in the document to LaTeX or HTML format.
+- Automatically recognize and convert tables in the document to HTML format.
- Automatically detect scanned PDFs and garbled PDFs and enable OCR functionality.
- OCR supports detection and recognition of 84 languages.
- Supports multiple output formats, such as multimodal and NLP Markdown, JSON sorted by reading order, and rich intermediate formats.
@@ -185,17 +185,11 @@ There are three different ways to experience MinerU:
| GPU Hardware Support List |
- Minimum Requirement 8G+ VRAM |
- 3060ti/3070/4060
- 8G VRAM enables layout, formula recognition acceleration and OCR acceleration |
+ GPU VRAM 8GB or more |
+ 2080~2080Ti / 3060Ti~3090Ti / 4060~4090
+ 8G VRAM can enable all acceleration features |
None |
-
- | Recommended Configuration 10G+ VRAM |
- 3080/3080ti/3090/3090ti/4070/4070ti/4070tisuper/4080/4090
- 10G VRAM or more can enable layout, formula recognition, OCR acceleration and table recognition acceleration simultaneously
- |
-
### Online Demo
@@ -247,7 +241,7 @@ You can modify certain configurations in this file to enable or disable features
"enable": true // The formula recognition feature is enabled by default. If you need to disable it, please change the value here to "false".
},
"table-config": {
- "model": "rapid_table", // When using structEqTable, please change to "struct_eqtable".
+    "model": "rapid_table", // Defaults to "rapid_table"; can be switched to "tablemaster" or "struct_eqtable".
"enable": false, // The table recognition feature is disabled by default. If you need to enable it, please change the value here to "true".
"max_time": 400
}
diff --git a/README_zh-CN.md b/README_zh-CN.md
index 4d8aec0d..e1648dbd 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -121,7 +121,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
- 保留原文档的结构,包括标题、段落、列表等
- 提取图像、图片描述、表格、表格标题及脚注
- 自动识别并转换文档中的公式为LaTeX格式
-- 自动识别并转换文档中的表格为LaTeX或HTML格式
+- 自动识别并转换文档中的表格为HTML格式
- 自动检测扫描版PDF和乱码PDF,并启用OCR功能
- OCR支持84种语言的检测与识别
- 支持多种输出格式,如多模态与NLP的Markdown、按阅读顺序排序的JSON、含有丰富信息的中间格式等
@@ -186,17 +186,13 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
| GPU硬件支持列表 |
- 最低要求 8G+显存 |
- 3060ti/3070/4060
- 8G显存可开启全部加速功能(表格仅限rapid_table) |
+ 显存8G以上 |
+
+ 2080~2080Ti / 3060Ti~3090Ti / 4060~4090
+ 8G显存及以上可开启全部加速功能 |
None |
-
- | 推荐配置 10G+显存 |
- 3080/3080ti/3090/3090ti/4070/4070ti/4070tisuper/4080/4090
- 10G显存及以上可开启全部加速功能
- |
-
+
### 在线体验
@@ -251,7 +247,7 @@ pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i h
"enable": true // 公式识别功能默认是开启的,如果需要关闭请修改此处的值为"false"
},
"table-config": {
- "model": "rapid_table", // 使用structEqTable请修改为"struct_eqtable"
+ "model": "rapid_table", // 默认使用"rapid_table",可以切换为"tablemaster"和"struct_eqtable"
"enable": false, // 表格识别功能默认是关闭的,如果需要开启请修改此处的值为"true"
"max_time": 400
}
diff --git a/projects/web_demo/web_demo/api/analysis/pdf_ext.py b/projects/web_demo/web_demo/api/analysis/pdf_ext.py
index b3a4a98d..1796677a 100644
--- a/projects/web_demo/web_demo/api/analysis/pdf_ext.py
+++ b/projects/web_demo/web_demo/api/analysis/pdf_ext.py
@@ -1,5 +1,7 @@
import json
import re
+import os
+import shutil
import traceback
from pathlib import Path
from flask import current_app, url_for
@@ -7,7 +9,7 @@ from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.pipe.UNIPipe import UNIPipe
import magic_pdf.model as model_config
from magic_pdf.libs.json_compressor import JsonCompressor
-from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_mm_markdown_with_para_and_pagination
+from common.mk_markdown.mk_markdown import ocr_mk_mm_markdown_with_para_and_pagination
from .ext import find_file
from ..extentions import app, db
from .models import AnalysisPdf, AnalysisTask
@@ -17,7 +19,7 @@ from loguru import logger
model_config.__use_inside_model__ = True
-def analysis_pdf(image_dir, pdf_bytes, is_ocr=False):
+def analysis_pdf(image_url_prefix, image_dir, pdf_bytes, is_ocr=False):
try:
model_json = [] # model_json传空list使用内置模型解析
logger.info(f"is_ocr: {is_ocr}")
@@ -40,7 +42,7 @@ def analysis_pdf(image_dir, pdf_bytes, is_ocr=False):
pipe.pipe_parse()
pdf_mid_data = JsonCompressor.decompress_json(pipe.get_compress_pdf_mid_data())
pdf_info_list = pdf_mid_data["pdf_info"]
- md_content = json.dumps(ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_list, image_dir),
+ md_content = json.dumps(ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_list, image_url_prefix),
ensure_ascii=False)
bbox_info = get_bbox_info(pdf_info_list)
return md_content, bbox_info
@@ -77,20 +79,22 @@ def analysis_pdf_task(pdf_dir, image_dir, pdf_path, is_ocr, analysis_pdf_id):
logger.info(f"image_dir: {image_dir}")
if not Path(image_dir).exists():
Path(image_dir).mkdir(parents=True, exist_ok=True)
+ else:
+ # 清空image_dir,避免同文件多次解析图片积累
+ shutil.rmtree(image_dir, ignore_errors=True)
+ os.makedirs(image_dir, exist_ok=True)
+
+ # 获取文件内容
with open(pdf_path, 'rb') as file:
pdf_bytes = file.read()
- md_content, bbox_info = analysis_pdf(image_dir, pdf_bytes, is_ocr)
- img_list = Path(image_dir).glob('*') if Path(image_dir).exists() else []
-
- pdf_name = Path(pdf_path).name
+ # 生成图片链接
with app.app_context():
- for img in img_list:
- img_name = Path(img).name
- regex = re.compile(fr'.*\((.*?{img_name})')
- regex_result = regex.search(md_content)
- if regex_result:
- img_url = url_for('analysis.imgview', filename=img_name, as_attachment=False)
- md_content = md_content.replace(regex_result.group(1), f"{img_url}&pdf={pdf_name}")
+ image_url_prefix = f"http://{current_app.config['SERVER_NAME']}{current_app.config['FILE_API']}&pdf={Path(pdf_path).name}&filename="
+ # 解析文件
+ md_content, bbox_info = analysis_pdf(image_url_prefix, image_dir, pdf_bytes, is_ocr)
+
+ # ############ markdown #############
+ pdf_name = Path(pdf_path).name
full_md_content = ""
for item in json.loads(md_content):
diff --git a/projects/web_demo/web_demo/app.py b/projects/web_demo/web_demo/app.py
index d98c0e56..61a01559 100644
--- a/projects/web_demo/web_demo/app.py
+++ b/projects/web_demo/web_demo/app.py
@@ -42,7 +42,7 @@ if database:
ip_address = get_local_ip()
port = config.get("PORT", 5559)
# 配置 SERVER_NAME
-config['SERVER_NAME'] = f'{ip_address}:5559'
+config['SERVER_NAME'] = f'{ip_address}:{port}'
# 配置 APPLICATION_ROOT
config['APPLICATION_ROOT'] = '/'
# 配置 PREFERRED_URL_SCHEME
diff --git a/projects/web_demo/web_demo/common/mk_markdown/__init__.py b/projects/web_demo/web_demo/common/mk_markdown/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/projects/web_demo/web_demo/common/mk_markdown/libs/__init__.py b/projects/web_demo/web_demo/common/mk_markdown/libs/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/projects/web_demo/web_demo/common/mk_markdown/libs/language.py b/projects/web_demo/web_demo/common/mk_markdown/libs/language.py
new file mode 100644
index 00000000..396c9008
--- /dev/null
+++ b/projects/web_demo/web_demo/common/mk_markdown/libs/language.py
@@ -0,0 +1,36 @@
+import os
+import unicodedata
+
+if not os.getenv("FTLANG_CACHE"):
+ current_file_path = os.path.abspath(__file__)
+ current_dir = os.path.dirname(current_file_path)
+ root_dir = os.path.dirname(current_dir)
+ ftlang_cache_dir = os.path.join(root_dir, 'resources', 'fasttext-langdetect')
+ os.environ["FTLANG_CACHE"] = str(ftlang_cache_dir)
+ # print(os.getenv("FTLANG_CACHE"))
+
+from fast_langdetect import detect_language
+
+
+def detect_lang(text: str) -> str:
+
+ if len(text) == 0:
+ return ""
+ try:
+ lang_upper = detect_language(text)
+ except:
+ html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
+ lang_upper = detect_language(html_no_ctrl_chars)
+ try:
+ lang = lang_upper.lower()
+ except:
+ lang = ""
+ return lang
+
+
+if __name__ == '__main__':
+ print(os.getenv("FTLANG_CACHE"))
+ print(detect_lang("This is a test."))
+ print(detect_lang("This is a test"))
+ print(detect_lang("这个是中文测试。"))
+ print(detect_lang("这个是中文测试。"))
diff --git a/projects/web_demo/web_demo/common/mk_markdown/libs/markdown_utils.py b/projects/web_demo/web_demo/common/mk_markdown/libs/markdown_utils.py
new file mode 100644
index 00000000..5708b477
--- /dev/null
+++ b/projects/web_demo/web_demo/common/mk_markdown/libs/markdown_utils.py
@@ -0,0 +1,31 @@
+import re
+
+
+def escape_special_markdown_char(pymu_blocks):
+ """
+ 转义正文里对markdown语法有特殊意义的字符
+ """
+ special_chars = ["*", "`", "~", "$"]
+ for blk in pymu_blocks:
+ for line in blk['lines']:
+ for span in line['spans']:
+ for char in special_chars:
+ span_text = span['text']
+ span_type = span.get("_type", None)
+ if span_type in ['inline-equation', 'interline-equation']:
+ continue
+ elif span_text:
+ span['text'] = span['text'].replace(char, "\\" + char)
+
+ return pymu_blocks
+
+
+def ocr_escape_special_markdown_char(content):
+ """
+ 转义正文里对markdown语法有特殊意义的字符
+ """
+ special_chars = ["*", "`", "~", "$"]
+ for char in special_chars:
+ content = content.replace(char, "\\" + char)
+
+ return content
diff --git a/projects/web_demo/web_demo/common/mk_markdown/libs/ocr_content_type.py b/projects/web_demo/web_demo/common/mk_markdown/libs/ocr_content_type.py
new file mode 100644
index 00000000..749c16f9
--- /dev/null
+++ b/projects/web_demo/web_demo/common/mk_markdown/libs/ocr_content_type.py
@@ -0,0 +1,38 @@
+class ContentType:
+ Image = 'image'
+ Table = 'table'
+ Text = 'text'
+ InlineEquation = 'inline_equation'
+ InterlineEquation = 'interline_equation'
+
+
+class BlockType:
+ Image = 'image'
+ ImageBody = 'image_body'
+ ImageCaption = 'image_caption'
+ ImageFootnote = 'image_footnote'
+ Table = 'table'
+ TableBody = 'table_body'
+ TableCaption = 'table_caption'
+ TableFootnote = 'table_footnote'
+ Text = 'text'
+ Title = 'title'
+ InterlineEquation = 'interline_equation'
+ Footnote = 'footnote'
+ Discarded = 'discarded'
+
+
+class CategoryId:
+ Title = 0
+ Text = 1
+ Abandon = 2
+ ImageBody = 3
+ ImageCaption = 4
+ TableBody = 5
+ TableCaption = 6
+ TableFootnote = 7
+ InterlineEquation_Layout = 8
+ InlineEquation = 13
+ InterlineEquation_YOLO = 14
+ OcrText = 15
+ ImageFootnote = 101
diff --git a/projects/web_demo/web_demo/common/mk_markdown/mk_markdown.py b/projects/web_demo/web_demo/common/mk_markdown/mk_markdown.py
new file mode 100644
index 00000000..7683efa3
--- /dev/null
+++ b/projects/web_demo/web_demo/common/mk_markdown/mk_markdown.py
@@ -0,0 +1,169 @@
+import re
+import wordninja
+from .libs.language import detect_lang
+from .libs.markdown_utils import ocr_escape_special_markdown_char
+from .libs.ocr_content_type import BlockType, ContentType
+
+
+def __is_hyphen_at_line_end(line):
+ """
+ Check if a line ends with one or more letters followed by a hyphen.
+
+ Args:
+ line (str): The line of text to check.
+
+ Returns:
+ bool: True if the line ends with one or more letters followed by a hyphen, False otherwise.
+ """
+ # Use regex to check if the line ends with one or more letters followed by a hyphen
+ return bool(re.search(r'[A-Za-z]+-\s*$', line))
+
+
+def split_long_words(text):
+ segments = text.split(' ')
+ for i in range(len(segments)):
+ words = re.findall(r'\w+|[^\w]', segments[i], re.UNICODE)
+ for j in range(len(words)):
+ if len(words[j]) > 10:
+ words[j] = ' '.join(wordninja.split(words[j]))
+ segments[i] = ''.join(words)
+ return ' '.join(segments)
+
+
+def join_path(*args):
+ return ''.join(str(s).rstrip('/') for s in args)
+
+
+def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
+ img_buket_path):
+ markdown_with_para_and_pagination = []
+ page_no = 0
+ for page_info in pdf_info_dict:
+ paras_of_layout = page_info.get('para_blocks')
+ if not paras_of_layout:
+ continue
+ page_markdown = ocr_mk_markdown_with_para_core_v2(
+ paras_of_layout, 'mm', img_buket_path)
+ markdown_with_para_and_pagination.append({
+ 'page_no':
+ page_no,
+ 'md_content':
+ '\n\n'.join(page_markdown)
+ })
+ page_no += 1
+ return markdown_with_para_and_pagination
+
+
+def merge_para_with_text(para_block):
+ def detect_language(text):
+ en_pattern = r'[a-zA-Z]+'
+ en_matches = re.findall(en_pattern, text)
+ en_length = sum(len(match) for match in en_matches)
+ if len(text) > 0:
+ if en_length / len(text) >= 0.5:
+ return 'en'
+ else:
+ return 'unknown'
+ else:
+ return 'empty'
+
+ para_text = ''
+ for line in para_block['lines']:
+ line_text = ''
+ line_lang = ''
+ for span in line['spans']:
+ span_type = span['type']
+ if span_type == ContentType.Text:
+ line_text += span['content'].strip()
+ if line_text != '':
+ line_lang = detect_lang(line_text)
+ for span in line['spans']:
+ span_type = span['type']
+ content = ''
+ if span_type == ContentType.Text:
+ content = span['content']
+ # language = detect_lang(content)
+ language = detect_language(content)
+ if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本
+ content = ocr_escape_special_markdown_char(
+ split_long_words(content))
+ else:
+ content = ocr_escape_special_markdown_char(content)
+ elif span_type == ContentType.InlineEquation:
+ content = f" ${span['content']}$ "
+ elif span_type == ContentType.InterlineEquation:
+ content = f"\n$$\n{span['content']}\n$$\n"
+
+ if content != '':
+ langs = ['zh', 'ja', 'ko']
+ if line_lang in langs: # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断
+ para_text += content # 中文/日语/韩文语境下,content间不需要空格分隔
+ elif line_lang == 'en':
+ # 如果是前一行带有-连字符,那么末尾不应该加空格
+ if __is_hyphen_at_line_end(content):
+ para_text += content[:-1]
+ else:
+ para_text += content + ' '
+ else:
+ para_text += content + ' ' # 西方文本语境下 content间需要空格分隔
+ return para_text
+
+
+def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
+ mode,
+ img_buket_path=''):
+ page_markdown = []
+ for para_block in paras_of_layout:
+ para_text = ''
+ para_type = para_block['type']
+ if para_type == BlockType.Text:
+ para_text = merge_para_with_text(para_block)
+ elif para_type == BlockType.Title:
+ para_text = f'# {merge_para_with_text(para_block)}'
+ elif para_type == BlockType.InterlineEquation:
+ para_text = merge_para_with_text(para_block)
+ elif para_type == BlockType.Image:
+ if mode == 'nlp':
+ continue
+ elif mode == 'mm':
+ for block in para_block['blocks']: # 1st.拼image_body
+ if block['type'] == BlockType.ImageBody:
+ for line in block['lines']:
+ for span in line['spans']:
+ if span['type'] == ContentType.Image:
+ para_text += f"\n}) \n"
+ for block in para_block['blocks']: # 2nd.拼image_caption
+ if block['type'] == BlockType.ImageCaption:
+ para_text += merge_para_with_text(block)
+                for block in para_block['blocks']:  # 3rd.拼image_footnote
+ if block['type'] == BlockType.ImageFootnote:
+ para_text += merge_para_with_text(block)
+ elif para_type == BlockType.Table:
+ if mode == 'nlp':
+ continue
+ elif mode == 'mm':
+ for block in para_block['blocks']: # 1st.拼table_caption
+ if block['type'] == BlockType.TableCaption:
+ para_text += merge_para_with_text(block)
+ for block in para_block['blocks']: # 2nd.拼table_body
+ if block['type'] == BlockType.TableBody:
+ for line in block['lines']:
+ for span in line['spans']:
+ if span['type'] == ContentType.Table:
+ # if processed by table model
+ if span.get('latex', ''):
+ para_text += f"\n\n$\n {span['latex']}\n$\n\n"
+ elif span.get('html', ''):
+ para_text += f"\n\n{span['html']}\n\n"
+ else:
+ para_text += f"\n}) \n"
+ for block in para_block['blocks']: # 3rd.拼table_footnote
+ if block['type'] == BlockType.TableFootnote:
+ para_text += merge_para_with_text(block)
+
+ if para_text.strip() == '':
+ continue
+ else:
+ page_markdown.append(para_text.strip() + ' ')
+
+ return page_markdown
diff --git a/projects/web_demo/web_demo/common/mk_markdown/resources/fasttext-langdetect/lid.176.ftz b/projects/web_demo/web_demo/common/mk_markdown/resources/fasttext-langdetect/lid.176.ftz
new file mode 100644
index 00000000..1fb85b35
Binary files /dev/null and b/projects/web_demo/web_demo/common/mk_markdown/resources/fasttext-langdetect/lid.176.ftz differ
diff --git a/projects/web_demo/web_demo/config/config.yaml b/projects/web_demo/web_demo/config/config.yaml
index d2ca3577..9c48062b 100644
--- a/projects/web_demo/web_demo/config/config.yaml
+++ b/projects/web_demo/web_demo/config/config.yaml
@@ -13,6 +13,8 @@ BaseConfig: &base
PDF_ANALYSIS_FOLDER: "analysis_pdf"
# 前端项目打包的路径
REACT_APP_DIST: "../../web/dist/"
+ # 文件访问路径
+ FILE_API: "/api/v2/analysis/pdf_img?as_attachment=False"
# 开发配置
DevelopmentConfig: