diff --git a/README.md b/README.md index 6c04379f..99d47e0a 100644 --- a/README.md +++ b/README.md @@ -121,7 +121,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c - Preserve the structure of the original document, including headings, paragraphs, lists, etc. - Extract images, image descriptions, tables, table titles, and footnotes. - Automatically recognize and convert formulas in the document to LaTeX format. -- Automatically recognize and convert tables in the document to LaTeX or HTML format. +- Automatically recognize and convert tables in the document to HTML format. - Automatically detect scanned PDFs and garbled PDFs and enable OCR functionality. - OCR supports detection and recognition of 84 languages. - Supports multiple output formats, such as multimodal and NLP Markdown, JSON sorted by reading order, and rich intermediate formats. @@ -185,17 +185,11 @@ There are three different ways to experience MinerU: GPU Hardware Support List - Minimum Requirement 8G+ VRAM - 3060ti/3070/4060
- 8G VRAM enables layout, formula recognition acceleration and OCR acceleration + GPU VRAM 8GB or more + 2080~2080Ti / 3060Ti~3090Ti / 4060~4090
+ 8G VRAM can enable all acceleration features None - - Recommended Configuration 10G+ VRAM - 3080/3080ti/3090/3090ti/4070/4070ti/4070tisuper/4080/4090
- 10G VRAM or more can enable layout, formula recognition, OCR acceleration and table recognition acceleration simultaneously - - ### Online Demo @@ -247,7 +241,7 @@ You can modify certain configurations in this file to enable or disable features "enable": true // The formula recognition feature is enabled by default. If you need to disable it, please change the value here to "false". }, "table-config": { - "model": "rapid_table", // When using structEqTable, please change to "struct_eqtable". + "model": "rapid_table", // Default to using "rapid_table", can be switched to "tablemaster" or "struct_eqtable". "enable": false, // The table recognition feature is disabled by default. If you need to enable it, please change the value here to "true". "max_time": 400 } diff --git a/README_zh-CN.md b/README_zh-CN.md index 4d8aec0d..e1648dbd 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -121,7 +121,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c - 保留原文档的结构,包括标题、段落、列表等 - 提取图像、图片描述、表格、表格标题及脚注 - 自动识别并转换文档中的公式为LaTeX格式 -- 自动识别并转换文档中的表格为LaTeX或HTML格式 +- 自动识别并转换文档中的表格为HTML格式 - 自动检测扫描版PDF和乱码PDF,并启用OCR功能 - OCR支持84种语言的检测与识别 - 支持多种输出格式,如多模态与NLP的Markdown、按阅读顺序排序的JSON、含有丰富信息的中间格式等 @@ -186,17 +186,13 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c GPU硬件支持列表 - 最低要求 8G+显存 - 3060ti/3070/4060
- 8G显存可开启全部加速功能(表格仅限rapid_table) + 显存8G以上 + + 2080~2080Ti / 3060Ti~3090Ti / 4060~4090
+ 8G显存及以上可开启全部加速功能 None - - 推荐配置 10G+显存 - 3080/3080ti/3090/3090ti/4070/4070ti/4070tisuper/4080/4090
- 10G显存及以上可开启全部加速功能
- - + ### 在线体验 @@ -251,7 +247,7 @@ pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i h "enable": true // 公式识别功能默认是开启的,如果需要关闭请修改此处的值为"false" }, "table-config": { - "model": "rapid_table", // 使用structEqTable请修改为"struct_eqtable" + "model": "rapid_table", // 默认使用"rapid_table",可以切换为"tablemaster"和"struct_eqtable" "enable": false, // 表格识别功能默认是关闭的,如果需要开启请修改此处的值为"true" "max_time": 400 } diff --git a/projects/web_demo/web_demo/api/analysis/pdf_ext.py b/projects/web_demo/web_demo/api/analysis/pdf_ext.py index b3a4a98d..1796677a 100644 --- a/projects/web_demo/web_demo/api/analysis/pdf_ext.py +++ b/projects/web_demo/web_demo/api/analysis/pdf_ext.py @@ -1,5 +1,7 @@ import json import re +import os +import shutil import traceback from pathlib import Path from flask import current_app, url_for @@ -7,7 +9,7 @@ from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter from magic_pdf.pipe.UNIPipe import UNIPipe import magic_pdf.model as model_config from magic_pdf.libs.json_compressor import JsonCompressor -from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_mm_markdown_with_para_and_pagination +from common.mk_markdown.mk_markdown import ocr_mk_mm_markdown_with_para_and_pagination from .ext import find_file from ..extentions import app, db from .models import AnalysisPdf, AnalysisTask @@ -17,7 +19,7 @@ from loguru import logger model_config.__use_inside_model__ = True -def analysis_pdf(image_dir, pdf_bytes, is_ocr=False): +def analysis_pdf(image_url_prefix, image_dir, pdf_bytes, is_ocr=False): try: model_json = [] # model_json传空list使用内置模型解析 logger.info(f"is_ocr: {is_ocr}") @@ -40,7 +42,7 @@ def analysis_pdf(image_dir, pdf_bytes, is_ocr=False): pipe.pipe_parse() pdf_mid_data = JsonCompressor.decompress_json(pipe.get_compress_pdf_mid_data()) pdf_info_list = pdf_mid_data["pdf_info"] - md_content = json.dumps(ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_list, image_dir), + md_content = 
json.dumps(ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_list, image_url_prefix), ensure_ascii=False) bbox_info = get_bbox_info(pdf_info_list) return md_content, bbox_info @@ -77,20 +79,22 @@ def analysis_pdf_task(pdf_dir, image_dir, pdf_path, is_ocr, analysis_pdf_id): logger.info(f"image_dir: {image_dir}") if not Path(image_dir).exists(): Path(image_dir).mkdir(parents=True, exist_ok=True) + else: + # 清空image_dir,避免同文件多次解析图片积累 + shutil.rmtree(image_dir, ignore_errors=True) + os.makedirs(image_dir, exist_ok=True) + + # 获取文件内容 with open(pdf_path, 'rb') as file: pdf_bytes = file.read() - md_content, bbox_info = analysis_pdf(image_dir, pdf_bytes, is_ocr) - img_list = Path(image_dir).glob('*') if Path(image_dir).exists() else [] - - pdf_name = Path(pdf_path).name + # 生成图片链接 with app.app_context(): - for img in img_list: - img_name = Path(img).name - regex = re.compile(fr'.*\((.*?{img_name})') - regex_result = regex.search(md_content) - if regex_result: - img_url = url_for('analysis.imgview', filename=img_name, as_attachment=False) - md_content = md_content.replace(regex_result.group(1), f"{img_url}&pdf={pdf_name}") + image_url_prefix = f"http://{current_app.config['SERVER_NAME']}{current_app.config['FILE_API']}&pdf={Path(pdf_path).name}&filename=" + # 解析文件 + md_content, bbox_info = analysis_pdf(image_url_prefix, image_dir, pdf_bytes, is_ocr) + + # ############ markdown ############# + pdf_name = Path(pdf_path).name full_md_content = "" for item in json.loads(md_content): diff --git a/projects/web_demo/web_demo/app.py b/projects/web_demo/web_demo/app.py index d98c0e56..61a01559 100644 --- a/projects/web_demo/web_demo/app.py +++ b/projects/web_demo/web_demo/app.py @@ -42,7 +42,7 @@ if database: ip_address = get_local_ip() port = config.get("PORT", 5559) # 配置 SERVER_NAME -config['SERVER_NAME'] = f'{ip_address}:5559' +config['SERVER_NAME'] = f'{ip_address}:{port}' # 配置 APPLICATION_ROOT config['APPLICATION_ROOT'] = '/' # 配置 PREFERRED_URL_SCHEME diff --git 
# Reconstructed from the new-file hunks for
# projects/web_demo/web_demo/common/mk_markdown/libs/ (the patch text had its
# newlines collapsed; code is reformatted to conventional layout).

# --- libs/language.py ---
import os
import unicodedata

# Point fast_langdetect's model cache at the vendored lid.176.ftz under
# resources/fasttext-langdetect so detection works offline; respect a
# pre-set FTLANG_CACHE from the environment.
if not os.getenv("FTLANG_CACHE"):
    current_file_path = os.path.abspath(__file__)
    current_dir = os.path.dirname(current_file_path)
    root_dir = os.path.dirname(current_dir)
    ftlang_cache_dir = os.path.join(root_dir, 'resources', 'fasttext-langdetect')
    os.environ["FTLANG_CACHE"] = str(ftlang_cache_dir)


def detect_lang(text: str) -> str:
    """Return the lower-cased language code detected for *text*.

    Returns "" for empty input or when the detector's result cannot be
    lower-cased.
    """
    if len(text) == 0:
        return ""
    # Imported lazily so this module (and the pure-string helpers below) can
    # be imported without fast_langdetect installed; FTLANG_CACHE was already
    # configured at module import, before the first call gets here.
    from fast_langdetect import detect_language
    try:
        lang_upper = detect_language(text)
    except Exception:
        # The detector can choke on control characters; retry without them.
        html_no_ctrl_chars = ''.join(
            [l for l in text if unicodedata.category(l)[0] not in ['C', ]])
        lang_upper = detect_language(html_no_ctrl_chars)
    try:
        lang = lang_upper.lower()
    except Exception:
        lang = ""
    return lang


# --- libs/markdown_utils.py ---

def escape_special_markdown_char(pymu_blocks):
    """Escape characters with special Markdown meaning in body-text spans.

    Equation spans are skipped so their LaTeX source is not corrupted.
    Mutates *pymu_blocks* in place and returns it.
    """
    special_chars = ["*", "`", "~", "$"]
    for blk in pymu_blocks:
        for line in blk['lines']:
            for span in line['spans']:
                # Leave formula spans untouched.
                if span.get("_type", None) in ['inline-equation', 'interline-equation']:
                    continue
                if span['text']:
                    for char in special_chars:
                        span['text'] = span['text'].replace(char, "\\" + char)
    return pymu_blocks


def ocr_escape_special_markdown_char(content):
    """Escape Markdown-special characters in a plain OCR text string."""
    special_chars = ["*", "`", "~", "$"]
    for char in special_chars:
        content = content.replace(char, "\\" + char)
    return content


# --- libs/ocr_content_type.py ---

class ContentType:
    # Span-level content kinds produced by the parsing pipeline.
    Image = 'image'
    Table = 'table'
    Text = 'text'
    InlineEquation = 'inline_equation'
    InterlineEquation = 'interline_equation'


class BlockType:
    # Block-level structure kinds (body/caption/footnote variants included).
    Image = 'image'
    ImageBody = 'image_body'
    ImageCaption = 'image_caption'
    ImageFootnote = 'image_footnote'
    Table = 'table'
    TableBody = 'table_body'
    TableCaption = 'table_caption'
    TableFootnote = 'table_footnote'
    Text = 'text'
    Title = 'title'
    InterlineEquation = 'interline_equation'
    Footnote = 'footnote'
    Discarded = 'discarded'


class CategoryId:
    # Numeric layout-model category ids.
    Title = 0
    Text = 1
    Abandon = 2
    ImageBody = 3
    ImageCaption = 4
    TableBody = 5
    TableCaption = 6
    TableFootnote = 7
    InterlineEquation_Layout = 8
    InlineEquation = 13
    InterlineEquation_YOLO = 14
    OcrText = 15
    ImageFootnote = 101


if __name__ == '__main__':
    print(os.getenv("FTLANG_CACHE"))
    print(detect_lang("This is a test."))
    print(detect_lang("这个是中文测试。"))
# Reconstructed from the new-file hunk for
# projects/web_demo/web_demo/common/mk_markdown/mk_markdown.py (the patch text
# had its newlines collapsed; code is reformatted to conventional layout).
import re

import wordninja

from .libs.language import detect_lang
from .libs.markdown_utils import ocr_escape_special_markdown_char
from .libs.ocr_content_type import BlockType, ContentType


def __is_hyphen_at_line_end(line):
    """
    Check if a line ends with one or more letters followed by a hyphen
    (optionally followed by trailing whitespace).

    Args:
        line (str): The line of text to check.

    Returns:
        bool: True if the line ends with letters followed by a hyphen.
    """
    return bool(re.search(r'[A-Za-z]+-\s*$', line))


def split_long_words(text):
    """Re-segment words longer than 10 characters with wordninja.

    OCR sometimes merges several words into a single token; wordninja splits
    such run-ons. Non-word characters are preserved in place.
    """
    segments = text.split(' ')
    for i in range(len(segments)):
        words = re.findall(r'\w+|[^\w]', segments[i], re.UNICODE)
        for j in range(len(words)):
            if len(words[j]) > 10:
                words[j] = ' '.join(wordninja.split(words[j]))
        segments[i] = ''.join(words)
    return ' '.join(segments)


def join_path(*args):
    """Concatenate path pieces, stripping trailing slashes.

    No separator is inserted: the caller's prefix is expected to end with one
    (e.g. the image URL prefix ending in '...&filename=').
    """
    return ''.join(str(s).rstrip('/') for s in args)


def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
                                                img_buket_path):
    """Render per-page multimodal Markdown.

    Returns a list of {'page_no': int, 'md_content': str}. Pages without
    'para_blocks' are skipped and do not advance page_no.
    """
    markdown_with_para_and_pagination = []
    page_no = 0
    for page_info in pdf_info_dict:
        paras_of_layout = page_info.get('para_blocks')
        if not paras_of_layout:
            continue
        page_markdown = ocr_mk_markdown_with_para_core_v2(
            paras_of_layout, 'mm', img_buket_path)
        markdown_with_para_and_pagination.append({
            'page_no': page_no,
            'md_content': '\n\n'.join(page_markdown)
        })
        page_no += 1
    return markdown_with_para_and_pagination


def merge_para_with_text(para_block):
    """Merge the spans of one paragraph block into a Markdown string,
    handling per-language spacing, Markdown escaping and equations."""

    def detect_language(text):
        # Cheap ASCII-ratio heuristic: 'en' if >= 50% latin letters.
        en_pattern = r'[a-zA-Z]+'
        en_matches = re.findall(en_pattern, text)
        en_length = sum(len(match) for match in en_matches)
        if len(text) > 0:
            if en_length / len(text) >= 0.5:
                return 'en'
            else:
                return 'unknown'
        else:
            return 'empty'

    para_text = ''
    for line in para_block['lines']:
        line_text = ''
        line_lang = ''
        for span in line['spans']:
            span_type = span['type']
            if span_type == ContentType.Text:
                line_text += span['content'].strip()
        if line_text != '':
            # Detect on the whole line: documents with one character per span
            # are too short for reliable per-span detection.
            line_lang = detect_lang(line_text)
        for span in line['spans']:
            span_type = span['type']
            content = ''
            if span_type == ContentType.Text:
                content = span['content']
                language = detect_language(content)
                if language == 'en':
                    # Only re-segment long English words; splitting CJK text
                    # would lose characters.
                    content = ocr_escape_special_markdown_char(
                        split_long_words(content))
                else:
                    content = ocr_escape_special_markdown_char(content)
            elif span_type == ContentType.InlineEquation:
                content = f" ${span['content']}$ "
            elif span_type == ContentType.InterlineEquation:
                content = f"\n$$\n{span['content']}\n$$\n"

            if content != '':
                langs = ['zh', 'ja', 'ko']
                if line_lang in langs:
                    # CJK text needs no space between spans.
                    para_text += content
                elif line_lang == 'en':
                    # A hyphenated word at the line end continues on the next
                    # line: drop the hyphen (and any trailing whitespace)
                    # instead of appending a space.
                    # BUGFIX: the original `content[:-1]` removed only the
                    # last character, which is a space (not the hyphen) when
                    # the span ends with "word- ".
                    if __is_hyphen_at_line_end(content):
                        para_text += re.sub(r'-\s*$', '', content)
                    else:
                        para_text += content + ' '
                else:
                    # Western text: separate spans with a space.
                    para_text += content + ' '
    return para_text


def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
                                      mode,
                                      img_buket_path=''):
    """Render a page's paragraph blocks to a list of Markdown strings.

    mode 'mm' emits images/tables (multimodal); mode 'nlp' skips them.
    """
    page_markdown = []
    for para_block in paras_of_layout:
        para_text = ''
        para_type = para_block['type']
        if para_type == BlockType.Text:
            para_text = merge_para_with_text(para_block)
        elif para_type == BlockType.Title:
            para_text = f'# {merge_para_with_text(para_block)}'
        elif para_type == BlockType.InterlineEquation:
            para_text = merge_para_with_text(para_block)
        elif para_type == BlockType.Image:
            if mode == 'nlp':
                continue
            elif mode == 'mm':
                for block in para_block['blocks']:  # 1st: image body
                    if block['type'] == BlockType.ImageBody:
                        for line in block['lines']:
                            for span in line['spans']:
                                if span['type'] == ContentType.Image:
                                    para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
                for block in para_block['blocks']:  # 2nd: image caption
                    if block['type'] == BlockType.ImageCaption:
                        para_text += merge_para_with_text(block)
                for block in para_block['blocks']:  # 3rd: image footnote
                    if block['type'] == BlockType.ImageFootnote:
                        para_text += merge_para_with_text(block)
        elif para_type == BlockType.Table:
            if mode == 'nlp':
                continue
            elif mode == 'mm':
                for block in para_block['blocks']:  # 1st: table caption
                    if block['type'] == BlockType.TableCaption:
                        para_text += merge_para_with_text(block)
                for block in para_block['blocks']:  # 2nd: table body
                    if block['type'] == BlockType.TableBody:
                        for line in block['lines']:
                            for span in line['spans']:
                                if span['type'] == ContentType.Table:
                                    # Prefer table-model output (LaTeX, then
                                    # HTML); fall back to the cropped image.
                                    if span.get('latex', ''):
                                        para_text += f"\n\n$\n {span['latex']}\n$\n\n"
                                    elif span.get('html', ''):
                                        para_text += f"\n\n{span['html']}\n\n"
                                    else:
                                        para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
                for block in para_block['blocks']:  # 3rd: table footnote
                    if block['type'] == BlockType.TableFootnote:
                        para_text += merge_para_with_text(block)

        if para_text.strip() == '':
            continue
        else:
            page_markdown.append(para_text.strip() + ' ')

    return page_markdown
"analysis_pdf" # 前端项目打包的路径 REACT_APP_DIST: "../../web/dist/" + # 文件访问路径 + FILE_API: "/api/v2/analysis/pdf_img?as_attachment=False" # 开发配置 DevelopmentConfig: