diff --git a/mineru/backend/vlm/predictor.py b/mineru/backend/vlm/predictor.py index f4976ec9..0d968445 100644 --- a/mineru/backend/vlm/predictor.py +++ b/mineru/backend/vlm/predictor.py @@ -77,7 +77,7 @@ def get_predictor( raise ImportError( "sglang is not installed, so sglang-engine backend cannot be used. " "If you need to use sglang-engine backend for inference, " - "please install sglang[all]==0.4.7 or a newer version." + "please install sglang[all]==0.4.8 or a newer version." ) predictor = SglangEnginePredictor( server_args=ServerArgs(model_path, **kwargs), diff --git a/projects/README.md b/projects/README.md index 6ffe4a0e..6a08be14 100644 --- a/projects/README.md +++ b/projects/README.md @@ -2,10 +2,6 @@ ## Project List -- Projects compatible with version 2.0: - - [gradio_app](./gradio_app/README.md): Web application based on Gradio - - Projects not yet compatible with version 2.0: - - [web_api](./web_api/README.md): Web API based on FastAPI - [multi_gpu](./multi_gpu/README.md): Multi-GPU parallel processing based on LitServe - [mcp](./mcp/README.md): MCP server based on the official API diff --git a/projects/README_zh-CN.md b/projects/README_zh-CN.md index 8ede7844..bdd6a245 100644 --- a/projects/README_zh-CN.md +++ b/projects/README_zh-CN.md @@ -2,10 +2,6 @@ ## 项目列表 -- 已兼容2.0版本的项目列表 - - [gradio_app](./gradio_app/README_zh-CN.md): 基于 Gradio 的 Web 应用 - - 未兼容2.0版本的项目列表 - - [web_api](./web_api/README.md): 基于 FastAPI 的 Web API - [multi_gpu](./multi_gpu/README.md): 基于 LitServe 的多 GPU 并行处理 - [mcp](./mcp/README.md): 基于官方api的mcp server diff --git a/projects/gradio_app/README.md b/projects/gradio_app/README.md deleted file mode 100644 index c04b068e..00000000 --- a/projects/gradio_app/README.md +++ /dev/null @@ -1,24 +0,0 @@ -## Installation - -MinerU(>=0.8.0) - > If you already have a functioning MinerU environment, you can skip this step. - > -[Deploy in CPU environment](https://github.com/opendatalab/MinerU?tab=readme-ov-file#quick-cpu-demo) - -[Deploy in GPU environment](https://github.com/opendatalab/MinerU?tab=readme-ov-file#using-gpu) - -Third-party Software - -```bash -pip install gradio gradio-pdf -``` - -## Start Gradio App - -```bash -python app.py -``` - -## Use Gradio App - -Access http://127.0.0.1:7860 in your web browser \ No newline at end of file diff --git a/projects/gradio_app/README_zh-CN.md b/projects/gradio_app/README_zh-CN.md deleted file mode 100644 index ce9b92c5..00000000 --- a/projects/gradio_app/README_zh-CN.md +++ /dev/null @@ -1,24 +0,0 @@ -## 安装 - -MinerU(>=0.8.0) - >如已有正常运行的MinerU环境则可以跳过此步骤 -> -[在CPU环境部署](https://github.com/opendatalab/MinerU/blob/master/README_zh-CN.md#%E4%BD%BF%E7%94%A8cpu%E5%BF%AB%E9%80%9F%E4%BD%93%E9%AA%8C) - -[在GPU环境部署](https://github.com/opendatalab/MinerU/blob/master/README_zh-CN.md#%E4%BD%BF%E7%94%A8gpu) - -第三方软件 - -```bash -pip install gradio gradio-pdf -``` - -## 启动gradio应用 - -```bash -python app.py -``` - -## 使用gradio应用 - -在浏览器中访问 http://127.0.0.1:7860 \ No newline at end of file diff --git a/projects/gradio_app/app.py b/projects/gradio_app/app.py deleted file mode 100644 index 92d8d5b7..00000000 --- a/projects/gradio_app/app.py +++ /dev/null @@ -1,212 +0,0 @@ -# Copyright (c) Opendatalab. All rights reserved. - -import base64 -import os -import re -import time -import zipfile -from pathlib import Path - -import gradio as gr -from gradio_pdf import PDF -from loguru import logger - -from mineru.cli.common import prepare_env, do_parse, read_fn -from mineru.utils.hash_utils import str_sha256 - - -def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, formula_enable, table_enable, language): - os.makedirs(output_dir, exist_ok=True) - - try: - file_name = f'{str(Path(doc_path).stem)}_{time.strftime("%y%m%d_%H%M%S")}' - pdf_data = read_fn(doc_path) - if is_ocr: - parse_method = 'ocr' - else: - parse_method = 'auto' - local_image_dir, local_md_dir = prepare_env(output_dir, file_name, parse_method) - do_parse( - output_dir=output_dir, - pdf_file_names=[file_name], - pdf_bytes_list=[pdf_data], - p_lang_list=[language], - parse_method=parse_method, - end_page_id=end_page_id, - p_formula_enable=formula_enable, - p_table_enable=table_enable, - ) - return local_md_dir, file_name - except Exception as e: - logger.exception(e) - - -def compress_directory_to_zip(directory_path, output_zip_path): - """压缩指定目录到一个 ZIP 文件。 - - :param directory_path: 要压缩的目录路径 - :param output_zip_path: 输出的 ZIP 文件路径 - """ - try: - with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf: - - # 遍历目录中的所有文件和子目录 - for root, dirs, files in os.walk(directory_path): - for file in files: - # 构建完整的文件路径 - file_path = os.path.join(root, file) - # 计算相对路径 - arcname = os.path.relpath(file_path, directory_path) - # 添加文件到 ZIP 文件 - zipf.write(file_path, arcname) - return 0 - except Exception as e: - logger.exception(e) - return -1 - - -def image_to_base64(image_path): - with open(image_path, 'rb') as image_file: - return base64.b64encode(image_file.read()).decode('utf-8') - - -def replace_image_with_base64(markdown_text, image_dir_path): - # 匹配Markdown中的图片标签 - pattern = r'\!\[(?:[^\]]*)\]\(([^)]+)\)' - - # 替换图片链接 - def replace(match): - relative_path = match.group(1) - full_path = os.path.join(image_dir_path, relative_path) - base64_image = image_to_base64(full_path) - return f'![{relative_path}](data:image/jpeg;base64,{base64_image})' - - # 应用替换 - return re.sub(pattern, replace, markdown_text) - - -def to_markdown(file_path, end_pages, is_ocr, formula_enable, table_enable, language): - file_path = to_pdf(file_path) - # 获取识别的md文件以及压缩包文件路径 - local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1, is_ocr, formula_enable, table_enable, language) - archive_zip_path = os.path.join('./output', str_sha256(local_md_dir) + '.zip') - zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path) - if zip_archive_success == 0: - logger.info('压缩成功') - else: - logger.error('压缩失败') - md_path = os.path.join(local_md_dir, file_name + '.md') - with open(md_path, 'r', encoding='utf-8') as f: - txt_content = f.read() - md_content = replace_image_with_base64(txt_content, local_md_dir) - # 返回转换后的PDF路径 - new_pdf_path = os.path.join(local_md_dir, file_name + '_layout.pdf') - - return md_content, txt_content, archive_zip_path, new_pdf_path - - -latex_delimiters = [ - {'left': '$$', 'right': '$$', 'display': True}, - {'left': '$', 'right': '$', 'display': False}, - {'left': '\\(', 'right': '\\)', 'display': False}, - {'left': '\\[', 'right': '\\]', 'display': True}, -] - - -with open('header.html', 'r') as file: - header = file.read() - - -latin_lang = [ - 'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr', # noqa: E126 - 'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl', - 'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv', - 'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german' -] -arabic_lang = ['ar', 'fa', 'ug', 'ur'] -cyrillic_lang = [ - 'ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava', # noqa: E126 - 'dar', 'inh', 'che', 'lbe', 'lez', 'tab' -] -devanagari_lang = [ - 'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom', # noqa: E126 - 'sa', 'bgc' -] -other_lang = ['ch', 'ch_lite', 'ch_server', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka'] -add_lang = ['latin', 'arabic', 'cyrillic', 'devanagari'] - -# all_lang = ['', 'auto'] -all_lang = [] -# all_lang.extend([*other_lang, *latin_lang, *arabic_lang, *cyrillic_lang, *devanagari_lang]) -all_lang.extend([*other_lang, *add_lang]) - - -def safe_stem(file_path): - stem = Path(file_path).stem - # 只保留字母、数字、下划线和点,其他字符替换为下划线 - return re.sub(r'[^\w.]', '_', stem) - - -def to_pdf(file_path): - - if file_path is None: - return None - - pdf_bytes = read_fn(file_path) - - # unique_filename = f'{uuid.uuid4()}.pdf' - unique_filename = f'{safe_stem(file_path)}.pdf' - - # 构建完整的文件路径 - tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename) - - # 将字节数据写入文件 - with open(tmp_file_path, 'wb') as tmp_pdf_file: - tmp_pdf_file.write(pdf_bytes) - - return tmp_file_path - - -if __name__ == '__main__': - with gr.Blocks() as demo: - gr.HTML(header) - with gr.Row(): - with gr.Column(variant='panel', scale=5): - with gr.Row(): - file = gr.File(label='Please upload a PDF or image', file_types=['.pdf', '.png', '.jpeg', '.jpg']) - with gr.Row(equal_height=True): - with gr.Column(scale=4): - max_pages = gr.Slider(1, 20, 10, step=1, label='Max convert pages') - with gr.Column(scale=1): - language = gr.Dropdown(all_lang, label='Language', value='ch') - with gr.Row(): - is_ocr = gr.Checkbox(label='Force enable OCR', value=False) - formula_enable = gr.Checkbox(label='Enable formula recognition', value=True) - table_enable = gr.Checkbox(label='Enable table recognition(test)', value=True) - with gr.Row(): - change_bu = gr.Button('Convert') - clear_bu = gr.ClearButton(value='Clear') - pdf_show = PDF(label='PDF preview', interactive=False, visible=True, height=800) - with gr.Accordion('Examples:'): - example_root = os.path.join(os.path.dirname(__file__), 'examples') - gr.Examples( - examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if - _.endswith('pdf')], - inputs=file - ) - - with gr.Column(variant='panel', scale=5): - output_file = gr.File(label='convert result', interactive=False) - with gr.Tabs(): - with gr.Tab('Markdown rendering'): - md = gr.Markdown(label='Markdown rendering', height=1100, show_copy_button=True, - latex_delimiters=latex_delimiters, - line_breaks=True) - with gr.Tab('Markdown text'): - md_text = gr.TextArea(lines=45, show_copy_button=True) - file.change(fn=to_pdf, inputs=file, outputs=pdf_show) - change_bu.click(fn=to_markdown, inputs=[file, max_pages, is_ocr, formula_enable, table_enable, language], - outputs=[md, md_text, output_file, pdf_show]) - clear_bu.add([file, md, pdf_show, md_text, output_file, is_ocr]) - - demo.launch(server_name='0.0.0.0') diff --git a/projects/gradio_app/examples/2list_1table.pdf b/projects/gradio_app/examples/2list_1table.pdf deleted file mode 100644 index dd9650bf..00000000 Binary files a/projects/gradio_app/examples/2list_1table.pdf and /dev/null differ diff --git a/projects/gradio_app/examples/3list_1table.pdf b/projects/gradio_app/examples/3list_1table.pdf deleted file mode 100644 index 5782751a..00000000 Binary files a/projects/gradio_app/examples/3list_1table.pdf and /dev/null differ diff --git a/projects/gradio_app/examples/academic_paper_formula.pdf b/projects/gradio_app/examples/academic_paper_formula.pdf deleted file mode 100644 index f1381cd2..00000000 Binary files a/projects/gradio_app/examples/academic_paper_formula.pdf and /dev/null differ diff --git a/projects/gradio_app/examples/academic_paper_img_formula.pdf b/projects/gradio_app/examples/academic_paper_img_formula.pdf deleted file mode 100644 index ab8ce7ea..00000000 Binary files a/projects/gradio_app/examples/academic_paper_img_formula.pdf and /dev/null differ diff --git a/projects/gradio_app/examples/academic_paper_list.pdf b/projects/gradio_app/examples/academic_paper_list.pdf deleted file mode 100644 index ab1d86b5..00000000 Binary files a/projects/gradio_app/examples/academic_paper_list.pdf and /dev/null differ diff --git a/projects/gradio_app/examples/complex_layout.pdf b/projects/gradio_app/examples/complex_layout.pdf deleted file mode 100644 index a4fc9c0f..00000000 Binary files a/projects/gradio_app/examples/complex_layout.pdf and /dev/null differ diff --git a/projects/gradio_app/examples/complex_layout_para_split_list.pdf b/projects/gradio_app/examples/complex_layout_para_split_list.pdf deleted file mode 100644 index ce34c640..00000000 Binary files a/projects/gradio_app/examples/complex_layout_para_split_list.pdf and /dev/null differ diff --git a/projects/gradio_app/examples/garbled_formula.pdf b/projects/gradio_app/examples/garbled_formula.pdf deleted file mode 100644 index a2c11939..00000000 Binary files a/projects/gradio_app/examples/garbled_formula.pdf and /dev/null differ diff --git a/projects/gradio_app/examples/magazine_complex_layout_images_list.pdf b/projects/gradio_app/examples/magazine_complex_layout_images_list.pdf deleted file mode 100644 index 8718fc0f..00000000 Binary files a/projects/gradio_app/examples/magazine_complex_layout_images_list.pdf and /dev/null differ diff --git a/projects/gradio_app/examples/scanned.pdf b/projects/gradio_app/examples/scanned.pdf deleted file mode 100755 index f35e53dd..00000000 Binary files a/projects/gradio_app/examples/scanned.pdf and /dev/null differ diff --git a/projects/gradio_app/header.html b/projects/gradio_app/header.html deleted file mode 100644 index 21b9184a..00000000 --- a/projects/gradio_app/header.html +++ /dev/null @@ -1,130 +0,0 @@ - - - - - -
-
-
-

- MinerU: PDF Extraction Demo -

-
-
- -

- A one-stop, open-source, high-quality data extraction tool, supports - PDF/webpage/e-book extraction.
-

- - -
- -
- - -
- - - \ No newline at end of file diff --git a/projects/gradio_app/requirements.txt b/projects/gradio_app/requirements.txt deleted file mode 100644 index f23b6756..00000000 --- a/projects/gradio_app/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -magic-pdf[full]>=0.8.0 -gradio -gradio-pdf \ No newline at end of file diff --git a/projects/web_api/Dockerfile b/projects/web_api/Dockerfile deleted file mode 100644 index 9b9a2ece..00000000 --- a/projects/web_api/Dockerfile +++ /dev/null @@ -1,67 +0,0 @@ -FROM python:3.10-slim-bookworm AS base - -WORKDIR /app - -ENV DEBIAN_FRONTEND=noninteractive \ - LANG=C.UTF-8 \ - PYTHONDONTWRITEBYTECODE=1 \ - PYTHONUNBUFFERED=1 \ - PIP_DISABLE_PIP_VERSION_CHECK=1 \ - PIP_NO_CACHE_DIR=1 - - -FROM base AS build - -# Update the package list and install necessary packages -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - build-essential && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -# Build Python dependencies -COPY requirements.txt . -RUN python -m venv /app/venv && \ - . /app/venv/bin/activate && \ - pip install -r requirements.txt -# pip uninstall -y paddlepaddle && \ -# pip install -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ \ -# paddlepaddle-gpu==3.0.0rc1 - -# Download models -COPY download_models.py . -RUN . /app/venv/bin/activate && \ - ./download_models.py - - -FROM base AS prod - -# Copy Python dependencies and models from the build stage -COPY --from=build /app/venv /app/venv -COPY --from=build /opt/models /opt/models -COPY --from=build /opt/layoutreader /opt/layoutreader - -# Update the package list and install necessary packages -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - libgl1 \ - libglib2.0-0 \ - libgomp1 && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -# Create volume for paddleocr models -# RUN mkdir -p /root/.paddleocr -# VOLUME [ "/root/.paddleocr" ] - -# Copy the app and its configuration file -COPY entrypoint.sh /app/entrypoint.sh -COPY magic-pdf.json /root/magic-pdf.json -COPY app.py /app/app.py - -# Expose the port that FastAPI will run on -EXPOSE 8000 - -# Command to run FastAPI using Uvicorn, pointing to app.py and binding to 0.0.0.0:8000 -ENTRYPOINT [ "/app/entrypoint.sh" ] -CMD ["--host", "0.0.0.0", "--port", "8000"] diff --git a/projects/web_api/README.md b/projects/web_api/README.md deleted file mode 100644 index 251cd3ea..00000000 --- a/projects/web_api/README.md +++ /dev/null @@ -1,31 +0,0 @@ -# 基于MinerU的PDF解析API - -- MinerU的GPU镜像构建 -- 基于FastAPI的PDF解析接口 - -## 构建方式 - -``` -docker build -t mineru-api . -``` - -或者使用代理: - -``` -docker build --build-arg http_proxy=http://127.0.0.1:7890 --build-arg https_proxy=http://127.0.0.1:7890 -t mineru-api . -``` - -## 启动命令 - -``` -docker run --rm -it --gpus=all -p 8000:8000 mineru-api -``` - -## 测试参数 - -访问地址: - -``` -http://localhost:8000/docs -http://127.0.0.1:8000/docs -``` \ No newline at end of file diff --git a/projects/web_api/app.py b/projects/web_api/app.py deleted file mode 100644 index 722aa287..00000000 --- a/projects/web_api/app.py +++ /dev/null @@ -1,305 +0,0 @@ -import json -import os -from base64 import b64encode -from glob import glob -from io import StringIO -import tempfile -from typing import Tuple, Union - -import uvicorn -from fastapi import FastAPI, HTTPException, UploadFile -from fastapi.responses import JSONResponse -from loguru import logger - -from magic_pdf.data.read_api import read_local_images, read_local_office -import magic_pdf.model as model_config -from magic_pdf.config.enums import SupportedPdfParseMethod -from magic_pdf.data.data_reader_writer import DataWriter, FileBasedDataWriter -from magic_pdf.data.data_reader_writer.s3 import S3DataReader, S3DataWriter -from magic_pdf.data.dataset import ImageDataset, PymuDocDataset -from magic_pdf.libs.config_reader import get_bucket_name, get_s3_config -from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze -from magic_pdf.operators.models import InferenceResult -from magic_pdf.operators.pipes import PipeResult -from fastapi import Form - -model_config.__use_inside_model__ = True - -app = FastAPI() - -pdf_extensions = [".pdf"] -office_extensions = [".ppt", ".pptx", ".doc", ".docx"] -image_extensions = [".png", ".jpg", ".jpeg"] - -class MemoryDataWriter(DataWriter): - def __init__(self): - self.buffer = StringIO() - - def write(self, path: str, data: bytes) -> None: - if isinstance(data, str): - self.buffer.write(data) - else: - self.buffer.write(data.decode("utf-8")) - - def write_string(self, path: str, data: str) -> None: - self.buffer.write(data) - - def get_value(self) -> str: - return self.buffer.getvalue() - - def close(self): - self.buffer.close() - - -def init_writers( - file_path: str = None, - file: UploadFile = None, - output_path: str = None, - output_image_path: str = None, -) -> Tuple[ - Union[S3DataWriter, FileBasedDataWriter], - Union[S3DataWriter, FileBasedDataWriter], - bytes, -]: - """ - Initialize writers based on path type - - Args: - file_path: file path (local path or S3 path) - file: Uploaded file object - output_path: Output directory path - output_image_path: Image output directory path - - Returns: - Tuple[writer, image_writer, file_bytes]: Returns initialized writer tuple and file content - """ - file_extension:str = None - if file_path: - is_s3_path = file_path.startswith("s3://") - if is_s3_path: - bucket = get_bucket_name(file_path) - ak, sk, endpoint = get_s3_config(bucket) - - writer = S3DataWriter( - output_path, bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint - ) - image_writer = S3DataWriter( - output_image_path, bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint - ) - # 临时创建reader读取文件内容 - temp_reader = S3DataReader( - "", bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint - ) - file_bytes = temp_reader.read(file_path) - file_extension = os.path.splitext(file_path)[1] - else: - writer = FileBasedDataWriter(output_path) - image_writer = FileBasedDataWriter(output_image_path) - os.makedirs(output_image_path, exist_ok=True) - with open(file_path, "rb") as f: - file_bytes = f.read() - file_extension = os.path.splitext(file_path)[1] - else: - # 处理上传的文件 - file_bytes = file.file.read() - file_extension = os.path.splitext(file.filename)[1] - - writer = FileBasedDataWriter(output_path) - image_writer = FileBasedDataWriter(output_image_path) - os.makedirs(output_image_path, exist_ok=True) - - return writer, image_writer, file_bytes, file_extension - - -def process_file( - file_bytes: bytes, - file_extension: str, - parse_method: str, - image_writer: Union[S3DataWriter, FileBasedDataWriter], -) -> Tuple[InferenceResult, PipeResult]: - """ - Process PDF file content - - Args: - file_bytes: Binary content of file - file_extension: file extension - parse_method: Parse method ('ocr', 'txt', 'auto') - image_writer: Image writer - - Returns: - Tuple[InferenceResult, PipeResult]: Returns inference result and pipeline result - """ - - ds: Union[PymuDocDataset, ImageDataset] = None - if file_extension in pdf_extensions: - ds = PymuDocDataset(file_bytes) - elif file_extension in office_extensions: - # 需要使用office解析 - temp_dir = tempfile.mkdtemp() - with open(os.path.join(temp_dir, f"temp_file.{file_extension}"), "wb") as f: - f.write(file_bytes) - ds = read_local_office(temp_dir)[0] - elif file_extension in image_extensions: - # 需要使用ocr解析 - temp_dir = tempfile.mkdtemp() - with open(os.path.join(temp_dir, f"temp_file.{file_extension}"), "wb") as f: - f.write(file_bytes) - ds = read_local_images(temp_dir)[0] - infer_result: InferenceResult = None - pipe_result: PipeResult = None - - if parse_method == "ocr": - infer_result = ds.apply(doc_analyze, ocr=True) - pipe_result = infer_result.pipe_ocr_mode(image_writer) - elif parse_method == "txt": - infer_result = ds.apply(doc_analyze, ocr=False) - pipe_result = infer_result.pipe_txt_mode(image_writer) - else: # auto - if ds.classify() == SupportedPdfParseMethod.OCR: - infer_result = ds.apply(doc_analyze, ocr=True) - pipe_result = infer_result.pipe_ocr_mode(image_writer) - else: - infer_result = ds.apply(doc_analyze, ocr=False) - pipe_result = infer_result.pipe_txt_mode(image_writer) - - return infer_result, pipe_result - - -def encode_image(image_path: str) -> str: - """Encode image using base64""" - with open(image_path, "rb") as f: - return b64encode(f.read()).decode() - - -@app.post( - "/file_parse", - tags=["projects"], - summary="Parse files (supports local files and S3)", -) -async def file_parse( - file: UploadFile = None, - file_path: str = Form(None), - parse_method: str = Form("auto"), - is_json_md_dump: bool = Form(False), - output_dir: str = Form("output"), - return_layout: bool = Form(False), - return_info: bool = Form(False), - return_content_list: bool = Form(False), - return_images: bool = Form(False), -): - """ - Execute the process of converting PDF to JSON and MD, outputting MD and JSON files - to the specified directory. - - Args: - file: The PDF file to be parsed. Must not be specified together with - `file_path` - file_path: The path to the PDF file to be parsed. Must not be specified together - with `file` - parse_method: Parsing method, can be auto, ocr, or txt. Default is auto. If - results are not satisfactory, try ocr - is_json_md_dump: Whether to write parsed data to .json and .md files. Default - to False. Different stages of data will be written to different .json files - (3 in total), md content will be saved to .md file - output_dir: Output directory for results. A folder named after the PDF file - will be created to store all results - return_layout: Whether to return parsed PDF layout. Default to False - return_info: Whether to return parsed PDF info. Default to False - return_content_list: Whether to return parsed PDF content list. Default to False - """ - try: - if (file is None and file_path is None) or ( - file is not None and file_path is not None - ): - return JSONResponse( - content={"error": "Must provide either file or file_path"}, - status_code=400, - ) - - # Get PDF filename - file_name = os.path.basename(file_path if file_path else file.filename).split( - "." - )[0] - output_path = f"{output_dir}/{file_name}" - output_image_path = f"{output_path}/images" - - # Initialize readers/writers and get PDF content - writer, image_writer, file_bytes, file_extension = init_writers( - file_path=file_path, - file=file, - output_path=output_path, - output_image_path=output_image_path, - ) - - # Process PDF - infer_result, pipe_result = process_file(file_bytes, file_extension, parse_method, image_writer) - - # Use MemoryDataWriter to get results - content_list_writer = MemoryDataWriter() - md_content_writer = MemoryDataWriter() - middle_json_writer = MemoryDataWriter() - - # Use PipeResult's dump method to get data - pipe_result.dump_content_list(content_list_writer, "", "images") - pipe_result.dump_md(md_content_writer, "", "images") - pipe_result.dump_middle_json(middle_json_writer, "") - - # Get content - content_list = json.loads(content_list_writer.get_value()) - md_content = md_content_writer.get_value() - middle_json = json.loads(middle_json_writer.get_value()) - model_json = infer_result.get_infer_res() - - # If results need to be saved - if is_json_md_dump: - writer.write_string( - f"{file_name}_content_list.json", content_list_writer.get_value() - ) - writer.write_string(f"{file_name}.md", md_content) - writer.write_string( - f"{file_name}_middle.json", middle_json_writer.get_value() - ) - writer.write_string( - f"{file_name}_model.json", - json.dumps(model_json, indent=4, ensure_ascii=False), - ) - # Save visualization results - pipe_result.draw_layout(os.path.join(output_path, f"{file_name}_layout.pdf")) - pipe_result.draw_span(os.path.join(output_path, f"{file_name}_spans.pdf")) - pipe_result.draw_line_sort( - os.path.join(output_path, f"{file_name}_line_sort.pdf") - ) - infer_result.draw_model(os.path.join(output_path, f"{file_name}_model.pdf")) - - # Build return data - data = {} - if return_layout: - data["layout"] = model_json - if return_info: - data["info"] = middle_json - if return_content_list: - data["content_list"] = content_list - if return_images: - image_paths = glob(f"{output_image_path}/*.jpg") - data["images"] = { - os.path.basename( - image_path - ): f"data:image/jpeg;base64,{encode_image(image_path)}" - for image_path in image_paths - } - data["md_content"] = md_content # md_content is always returned - - # Clean up memory writers - content_list_writer.close() - md_content_writer.close() - middle_json_writer.close() - - return JSONResponse(data, status_code=200) - - except Exception as e: - logger.exception(e) - return JSONResponse(content={"error": str(e)}, status_code=500) - - -if __name__ == "__main__": - uvicorn.run(app, host="0.0.0.0", port=8888) diff --git a/projects/web_api/download_models.py b/projects/web_api/download_models.py deleted file mode 100755 index 9d920a9e..00000000 --- a/projects/web_api/download_models.py +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env python -from huggingface_hub import snapshot_download - -if __name__ == "__main__": - - mineru_patterns = [ - # "models/Layout/LayoutLMv3/*", - "models/Layout/YOLO/*", - "models/MFD/YOLO/*", - "models/MFR/unimernet_hf_small_2503/*", - "models/OCR/paddleocr_torch/*", - # "models/TabRec/TableMaster/*", - # "models/TabRec/StructEqTable/*", - ] - model_dir = snapshot_download( - "opendatalab/PDF-Extract-Kit-1.0", - allow_patterns=mineru_patterns, - local_dir="/opt/", - ) - - layoutreader_pattern = [ - "*.json", - "*.safetensors", - ] - layoutreader_model_dir = snapshot_download( - "hantian/layoutreader", - allow_patterns=layoutreader_pattern, - local_dir="/opt/layoutreader/", - ) - - model_dir = model_dir + "/models" - print(f"model_dir is: {model_dir}") - print(f"layoutreader_model_dir is: {layoutreader_model_dir}") diff --git a/projects/web_api/entrypoint.sh b/projects/web_api/entrypoint.sh deleted file mode 100755 index e0e621fc..00000000 --- a/projects/web_api/entrypoint.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -. /app/venv/bin/activate -exec uvicorn app:app "$@" diff --git a/projects/web_api/magic-pdf.json b/projects/web_api/magic-pdf.json deleted file mode 100644 index a2dc7de0..00000000 --- a/projects/web_api/magic-pdf.json +++ /dev/null @@ -1,44 +0,0 @@ -{ - "bucket_info":{ - "bucket-name-1":["ak", "sk", "endpoint"], - "bucket-name-2":["ak", "sk", "endpoint"] - }, - "models-dir":"/opt/models", - "layoutreader-model-dir":"/opt/layoutreader", - "device-mode":"cuda", - "layout-config": { - "model": "doclayout_yolo" - }, - "formula-config": { - "mfd_model": "yolo_v8_mfd", - "mfr_model": "unimernet_small", - "enable": true - }, - "table-config": { - "model": "rapid_table", - "sub_model": "slanet_plus", - "enable": true, - "max_time": 400 - }, - "llm-aided-config": { - "formula_aided": { - "api_key": "your_api_key", - "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1", - "model": "qwen2.5-7b-instruct", - "enable": false - }, - "text_aided": { - "api_key": "your_api_key", - "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1", - "model": "qwen2.5-7b-instruct", - "enable": false - }, - "title_aided": { - "api_key": "your_api_key", - "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1", - "model": "qwen2.5-32b-instruct", - "enable": false - } - }, - "config_version": "1.2.0" -} diff --git a/projects/web_api/requirements.txt b/projects/web_api/requirements.txt deleted file mode 100644 index d9d6d9c2..00000000 --- a/projects/web_api/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -magic-pdf[full] - -fastapi -uvicorn -python-multipart