fix: update sglang version requirement in error message and clean up README files
@@ -2,10 +2,6 @@

## Project List

- Projects compatible with version 2.0:
  - [gradio_app](./gradio_app/README.md): Web application based on Gradio

- Projects not yet compatible with version 2.0:
  - [web_api](./web_api/README.md): Web API based on FastAPI
  - [multi_gpu](./multi_gpu/README.md): Multi-GPU parallel processing based on LitServe
  - [mcp](./mcp/README.md): MCP server based on the official API
@@ -2,10 +2,6 @@

## Project List

- Projects compatible with version 2.0:
  - [gradio_app](./gradio_app/README_zh-CN.md): Web application based on Gradio

- Projects not yet compatible with version 2.0:
  - [web_api](./web_api/README.md): Web API based on FastAPI
  - [multi_gpu](./multi_gpu/README.md): Multi-GPU parallel processing based on LitServe
  - [mcp](./mcp/README.md): MCP server based on the official API
@@ -1,24 +0,0 @@

## Installation

MinerU (>=0.8.0)

> If you already have a functioning MinerU environment, you can skip this step.

[Deploy in CPU environment](https://github.com/opendatalab/MinerU?tab=readme-ov-file#quick-cpu-demo)

[Deploy in GPU environment](https://github.com/opendatalab/MinerU?tab=readme-ov-file#using-gpu)

Third-party software:

```bash
pip install gradio gradio-pdf
```

## Start the Gradio App

```bash
python app.py
```

## Use the Gradio App

Access http://127.0.0.1:7860 in your web browser.
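Gradio also honors the standard `GRADIO_SERVER_PORT` environment variable, so if port 7860 is already taken you can pick another one without editing the code (a hedged aside, not part of the original README; the app itself only pins the host in `demo.launch`):

```bash
GRADIO_SERVER_PORT=7861 python app.py
```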
@@ -1,24 +0,0 @@

## Installation

MinerU (>=0.8.0)

> If you already have a functioning MinerU environment, you can skip this step.

[Deploy in CPU environment](https://github.com/opendatalab/MinerU/blob/master/README_zh-CN.md#%E4%BD%BF%E7%94%A8cpu%E5%BF%AB%E9%80%9F%E4%BD%93%E9%AA%8C)

[Deploy in GPU environment](https://github.com/opendatalab/MinerU/blob/master/README_zh-CN.md#%E4%BD%BF%E7%94%A8gpu)

Third-party software:

```bash
pip install gradio gradio-pdf
```

## Start the Gradio App

```bash
python app.py
```

## Use the Gradio App

Access http://127.0.0.1:7860 in your web browser.
@@ -1,212 +0,0 @@

# Copyright (c) Opendatalab. All rights reserved.

import base64
import os
import re
import time
import zipfile
from pathlib import Path

import gradio as gr
from gradio_pdf import PDF
from loguru import logger

from mineru.cli.common import prepare_env, do_parse, read_fn
from mineru.utils.hash_utils import str_sha256


def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, formula_enable, table_enable, language):
    os.makedirs(output_dir, exist_ok=True)

    try:
        file_name = f'{str(Path(doc_path).stem)}_{time.strftime("%y%m%d_%H%M%S")}'
        pdf_data = read_fn(doc_path)
        if is_ocr:
            parse_method = 'ocr'
        else:
            parse_method = 'auto'
        local_image_dir, local_md_dir = prepare_env(output_dir, file_name, parse_method)
        do_parse(
            output_dir=output_dir,
            pdf_file_names=[file_name],
            pdf_bytes_list=[pdf_data],
            p_lang_list=[language],
            parse_method=parse_method,
            end_page_id=end_page_id,
            p_formula_enable=formula_enable,
            p_table_enable=table_enable,
        )
        return local_md_dir, file_name
    except Exception as e:
        logger.exception(e)
        # Re-raise so the caller does not try to unpack None
        raise


def compress_directory_to_zip(directory_path, output_zip_path):
    """Compress the given directory into a ZIP file.

    :param directory_path: path of the directory to compress
    :param output_zip_path: path of the output ZIP file
    """
    try:
        with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:

            # Walk all files and subdirectories in the directory
            for root, dirs, files in os.walk(directory_path):
                for file in files:
                    # Build the full file path
                    file_path = os.path.join(root, file)
                    # Compute the path relative to the directory root
                    arcname = os.path.relpath(file_path, directory_path)
                    # Add the file to the ZIP archive
                    zipf.write(file_path, arcname)
        return 0
    except Exception as e:
        logger.exception(e)
        return -1


def image_to_base64(image_path):
    with open(image_path, 'rb') as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')


def replace_image_with_base64(markdown_text, image_dir_path):
    # Match image tags in the Markdown text
    pattern = r'\!\[(?:[^\]]*)\]\(([^)]+)\)'

    # Replace each image link with an inline base64 data URI
    def replace(match):
        relative_path = match.group(1)
        full_path = os.path.join(image_dir_path, relative_path)
        base64_image = image_to_base64(full_path)
        return f'![{relative_path}](data:image/jpeg;base64,{base64_image})'

    # Apply the replacement
    return re.sub(pattern, replace, markdown_text)


def to_markdown(file_path, end_pages, is_ocr, formula_enable, table_enable, language):
    file_path = to_pdf(file_path)
    # Get the directory of the recognized md file and the archive path
    local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1, is_ocr, formula_enable, table_enable, language)
    archive_zip_path = os.path.join('./output', str_sha256(local_md_dir) + '.zip')
    zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
    if zip_archive_success == 0:
        logger.info('Compression succeeded')
    else:
        logger.error('Compression failed')
    md_path = os.path.join(local_md_dir, file_name + '.md')
    with open(md_path, 'r', encoding='utf-8') as f:
        txt_content = f.read()
    md_content = replace_image_with_base64(txt_content, local_md_dir)
    # Path of the layout-annotated PDF
    new_pdf_path = os.path.join(local_md_dir, file_name + '_layout.pdf')

    return md_content, txt_content, archive_zip_path, new_pdf_path


latex_delimiters = [
    {'left': '$$', 'right': '$$', 'display': True},
    {'left': '$', 'right': '$', 'display': False},
    {'left': '\\(', 'right': '\\)', 'display': False},
    {'left': '\\[', 'right': '\\]', 'display': True},
]


with open('header.html', 'r') as file:
    header = file.read()


latin_lang = [
    'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr',  # noqa: E126
    'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl',
    'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv',
    'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german'
]
arabic_lang = ['ar', 'fa', 'ug', 'ur']
cyrillic_lang = [
    'ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava',  # noqa: E126
    'dar', 'inh', 'che', 'lbe', 'lez', 'tab'
]
devanagari_lang = [
    'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom',  # noqa: E126
    'sa', 'bgc'
]
other_lang = ['ch', 'ch_lite', 'ch_server', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']
add_lang = ['latin', 'arabic', 'cyrillic', 'devanagari']

# all_lang = ['', 'auto']
all_lang = []
# all_lang.extend([*other_lang, *latin_lang, *arabic_lang, *cyrillic_lang, *devanagari_lang])
all_lang.extend([*other_lang, *add_lang])


def safe_stem(file_path):
    stem = Path(file_path).stem
    # Keep only letters, digits, underscores and dots; replace everything else with underscores
    return re.sub(r'[^\w.]', '_', stem)


def to_pdf(file_path):

    if file_path is None:
        return None

    pdf_bytes = read_fn(file_path)

    # unique_filename = f'{uuid.uuid4()}.pdf'
    unique_filename = f'{safe_stem(file_path)}.pdf'

    # Build the full output file path
    tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename)

    # Write the byte data to the file
    with open(tmp_file_path, 'wb') as tmp_pdf_file:
        tmp_pdf_file.write(pdf_bytes)

    return tmp_file_path


if __name__ == '__main__':
    with gr.Blocks() as demo:
        gr.HTML(header)
        with gr.Row():
            with gr.Column(variant='panel', scale=5):
                with gr.Row():
                    file = gr.File(label='Please upload a PDF or image', file_types=['.pdf', '.png', '.jpeg', '.jpg'])
                with gr.Row(equal_height=True):
                    with gr.Column(scale=4):
                        max_pages = gr.Slider(1, 20, 10, step=1, label='Max convert pages')
                    with gr.Column(scale=1):
                        language = gr.Dropdown(all_lang, label='Language', value='ch')
                with gr.Row():
                    is_ocr = gr.Checkbox(label='Force enable OCR', value=False)
                    formula_enable = gr.Checkbox(label='Enable formula recognition', value=True)
                    table_enable = gr.Checkbox(label='Enable table recognition(test)', value=True)
                with gr.Row():
                    change_bu = gr.Button('Convert')
                    clear_bu = gr.ClearButton(value='Clear')
                pdf_show = PDF(label='PDF preview', interactive=False, visible=True, height=800)
                with gr.Accordion('Examples:'):
                    example_root = os.path.join(os.path.dirname(__file__), 'examples')
                    gr.Examples(
                        examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if
                                  _.endswith('pdf')],
                        inputs=file
                    )

            with gr.Column(variant='panel', scale=5):
                output_file = gr.File(label='convert result', interactive=False)
                with gr.Tabs():
                    with gr.Tab('Markdown rendering'):
                        md = gr.Markdown(label='Markdown rendering', height=1100, show_copy_button=True,
                                         latex_delimiters=latex_delimiters,
                                         line_breaks=True)
                    with gr.Tab('Markdown text'):
                        md_text = gr.TextArea(lines=45, show_copy_button=True)
        file.change(fn=to_pdf, inputs=file, outputs=pdf_show)
        change_bu.click(fn=to_markdown, inputs=[file, max_pages, is_ocr, formula_enable, table_enable, language],
                        outputs=[md, md_text, output_file, pdf_show])
        clear_bu.add([file, md, pdf_show, md_text, output_file, is_ocr])

    demo.launch(server_name='0.0.0.0')
Binary files not shown.
@@ -1,130 +0,0 @@

<html><head>
<link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.15.4/css/all.css">
<style>
  .link-block {
    border: 1px solid transparent;
    border-radius: 24px;
    background-color: rgba(54, 54, 54, 1);
    cursor: pointer !important;
  }
  .link-block:hover {
    background-color: rgba(54, 54, 54, 0.75) !important;
    cursor: pointer !important;
  }
  .external-link {
    display: inline-flex;
    align-items: center;
    height: 36px;
    line-height: 36px;
    padding: 0 16px;
    cursor: pointer !important;
  }
  .external-link,
  .external-link:hover {
    cursor: pointer !important;
  }
  a {
    text-decoration: none;
  }
</style></head>

<body>
<div style="
  display: flex;
  flex-direction: column;
  justify-content: center;
  align-items: center;
  text-align: center;
  background: linear-gradient(45deg, #007bff 0%, #0056b3 100%);
  padding: 24px;
  gap: 24px;
  border-radius: 8px;
">
  <div style="
    display: flex;
    flex-direction: column;
    align-items: center;
    gap: 16px;
  ">
    <div style="display: flex; flex-direction: column; gap: 8px">
      <h1 style="
        font-size: 48px;
        color: #fafafa;
        margin: 0;
        font-family: 'Trebuchet MS', 'Lucida Sans Unicode',
          'Lucida Grande', 'Lucida Sans', Arial, sans-serif;
      ">
        MinerU: PDF Extraction Demo
      </h1>
    </div>
  </div>

  <p style="
    margin: 0;
    line-height: 1.6rem;
    font-size: 16px;
    color: #fafafa;
    opacity: 0.8;
  ">
    A one-stop, open-source, high-quality data extraction tool, supports
    PDF/webpage/e-book extraction.<br>
  </p>
  <style>
    .link-block {
      display: inline-block;
    }
    .link-block + .link-block {
      margin-left: 20px;
    }
  </style>

  <div class="column has-text-centered">
    <div class="publication-links">
      <!-- Code Link. -->
      <span class="link-block">
        <a href="https://github.com/opendatalab/MinerU" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
          <span class="icon" style="margin-right: 4px">
            <i class="fab fa-github" style="color: white; margin-right: 4px"></i>
          </span>
          <span style="color: white">Code</span>
        </a>
      </span>

      <!-- arXiv Link. -->
      <span class="link-block">
        <a href="https://arxiv.org/abs/2409.18839" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
          <span class="icon" style="margin-right: 8px">
            <i class="fas fa-file" style="color: white"></i>
          </span>
          <span style="color: white">Paper</span>
        </a>
      </span>

      <!-- Homepage Link. -->
      <span class="link-block">
        <a href="https://mineru.net/home?source=online" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
          <span class="icon" style="margin-right: 8px">
            <i class="fas fa-home" style="color: white"></i>
          </span>
          <span style="color: white">Homepage</span>
        </a>
      </span>

      <!-- Client Link. -->
      <span class="link-block">
        <a href="https://mineru.net/client?source=online" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
          <span class="icon" style="margin-right: 8px">
            <i class="fas fa-download" style="color: white"></i>
          </span>
          <span style="color: white">Download</span>
        </a>
      </span>

    </div>
  </div>

  <!-- New Demo Links -->
</div>

</body></html>
@@ -1,3 +0,0 @@

magic-pdf[full]>=0.8.0
gradio
gradio-pdf
@@ -1,67 +0,0 @@

FROM python:3.10-slim-bookworm AS base

WORKDIR /app

ENV DEBIAN_FRONTEND=noninteractive \
    LANG=C.UTF-8 \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_NO_CACHE_DIR=1


FROM base AS build

# Update the package list and install necessary packages
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Build Python dependencies
COPY requirements.txt .
RUN python -m venv /app/venv && \
    . /app/venv/bin/activate && \
    pip install -r requirements.txt
    # pip uninstall -y paddlepaddle && \
    # pip install -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ \
    #     paddlepaddle-gpu==3.0.0rc1

# Download models
COPY download_models.py .
RUN . /app/venv/bin/activate && \
    ./download_models.py


FROM base AS prod

# Copy Python dependencies and models from the build stage
COPY --from=build /app/venv /app/venv
COPY --from=build /opt/models /opt/models
COPY --from=build /opt/layoutreader /opt/layoutreader

# Update the package list and install necessary packages
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        libgl1 \
        libglib2.0-0 \
        libgomp1 && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Create volume for paddleocr models
# RUN mkdir -p /root/.paddleocr
# VOLUME [ "/root/.paddleocr" ]

# Copy the app and its configuration file
COPY entrypoint.sh /app/entrypoint.sh
COPY magic-pdf.json /root/magic-pdf.json
COPY app.py /app/app.py

# Expose the port that FastAPI will run on
EXPOSE 8000

# Run FastAPI via Uvicorn, binding to 0.0.0.0:8000 by default
ENTRYPOINT [ "/app/entrypoint.sh" ]
CMD ["--host", "0.0.0.0", "--port", "8000"]
@@ -1,31 +0,0 @@

# PDF parsing API based on MinerU

- GPU Docker image build for MinerU
- PDF parsing endpoint based on FastAPI

## Build

```
docker build -t mineru-api .
```

Or build through a proxy:

```
docker build --build-arg http_proxy=http://127.0.0.1:7890 --build-arg https_proxy=http://127.0.0.1:7890 -t mineru-api .
```

## Run

```
docker run --rm -it --gpus=all -p 8000:8000 mineru-api
```
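The image's entrypoint forwards its arguments straight to Uvicorn (see `entrypoint.sh`), so the default `CMD` can be overridden to change the listen port; a sketch, with 9000 as an arbitrary choice:

```
docker run --rm -it --gpus=all -p 9000:9000 mineru-api --host 0.0.0.0 --port 9000
```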

## Test the API

Open the interactive docs at:

```
http://localhost:8000/docs
http://127.0.0.1:8000/docs
```
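For a quick command-line check of the `/file_parse` endpoint defined in `app.py`, a minimal sketch (`demo.pdf` stands in for any local PDF):

```
curl -X POST http://127.0.0.1:8000/file_parse \
  -F "file=@demo.pdf" \
  -F "parse_method=auto" \
  -F "return_content_list=true"
```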
@@ -1,305 +0,0 @@

import json
import os
import tempfile
from base64 import b64encode
from glob import glob
from io import StringIO
from typing import Tuple, Union

import uvicorn
from fastapi import FastAPI, Form, UploadFile
from fastapi.responses import JSONResponse
from loguru import logger

import magic_pdf.model as model_config
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.data_reader_writer import DataWriter, FileBasedDataWriter
from magic_pdf.data.data_reader_writer.s3 import S3DataReader, S3DataWriter
from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
from magic_pdf.data.read_api import read_local_images, read_local_office
from magic_pdf.libs.config_reader import get_bucket_name, get_s3_config
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.operators.models import InferenceResult
from magic_pdf.operators.pipes import PipeResult

model_config.__use_inside_model__ = True

app = FastAPI()

pdf_extensions = [".pdf"]
office_extensions = [".ppt", ".pptx", ".doc", ".docx"]
image_extensions = [".png", ".jpg", ".jpeg"]


class MemoryDataWriter(DataWriter):
    def __init__(self):
        self.buffer = StringIO()

    def write(self, path: str, data: bytes) -> None:
        if isinstance(data, str):
            self.buffer.write(data)
        else:
            self.buffer.write(data.decode("utf-8"))

    def write_string(self, path: str, data: str) -> None:
        self.buffer.write(data)

    def get_value(self) -> str:
        return self.buffer.getvalue()

    def close(self):
        self.buffer.close()


def init_writers(
    file_path: str = None,
    file: UploadFile = None,
    output_path: str = None,
    output_image_path: str = None,
) -> Tuple[
    Union[S3DataWriter, FileBasedDataWriter],
    Union[S3DataWriter, FileBasedDataWriter],
    bytes,
    str,
]:
    """
    Initialize writers based on path type.

    Args:
        file_path: file path (local path or S3 path)
        file: uploaded file object
        output_path: output directory path
        output_image_path: image output directory path

    Returns:
        Tuple[writer, image_writer, file_bytes, file_extension]: the initialized
        writers, the file content, and the file extension
    """
    file_extension: str = None
    if file_path:
        is_s3_path = file_path.startswith("s3://")
        if is_s3_path:
            bucket = get_bucket_name(file_path)
            ak, sk, endpoint = get_s3_config(bucket)

            writer = S3DataWriter(
                output_path, bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint
            )
            image_writer = S3DataWriter(
                output_image_path, bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint
            )
            # Create a temporary reader to fetch the file content
            temp_reader = S3DataReader(
                "", bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint
            )
            file_bytes = temp_reader.read(file_path)
            file_extension = os.path.splitext(file_path)[1]
        else:
            writer = FileBasedDataWriter(output_path)
            image_writer = FileBasedDataWriter(output_image_path)
            os.makedirs(output_image_path, exist_ok=True)
            with open(file_path, "rb") as f:
                file_bytes = f.read()
            file_extension = os.path.splitext(file_path)[1]
    else:
        # Handle the uploaded file
        file_bytes = file.file.read()
        file_extension = os.path.splitext(file.filename)[1]

        writer = FileBasedDataWriter(output_path)
        image_writer = FileBasedDataWriter(output_image_path)
        os.makedirs(output_image_path, exist_ok=True)

    return writer, image_writer, file_bytes, file_extension


def process_file(
    file_bytes: bytes,
    file_extension: str,
    parse_method: str,
    image_writer: Union[S3DataWriter, FileBasedDataWriter],
) -> Tuple[InferenceResult, PipeResult]:
    """
    Process PDF file content.

    Args:
        file_bytes: binary content of the file
        file_extension: file extension (including the leading dot)
        parse_method: parse method ('ocr', 'txt', 'auto')
        image_writer: image writer

    Returns:
        Tuple[InferenceResult, PipeResult]: the inference result and pipeline result
    """

    ds: Union[PymuDocDataset, ImageDataset] = None
    if file_extension in pdf_extensions:
        ds = PymuDocDataset(file_bytes)
    elif file_extension in office_extensions:
        # Office documents must go through the office reader
        temp_dir = tempfile.mkdtemp()
        # file_extension already includes the leading dot
        with open(os.path.join(temp_dir, f"temp_file{file_extension}"), "wb") as f:
            f.write(file_bytes)
        ds = read_local_office(temp_dir)[0]
    elif file_extension in image_extensions:
        # Images must go through OCR
        temp_dir = tempfile.mkdtemp()
        with open(os.path.join(temp_dir, f"temp_file{file_extension}"), "wb") as f:
            f.write(file_bytes)
        ds = read_local_images(temp_dir)[0]
    infer_result: InferenceResult = None
    pipe_result: PipeResult = None

    if parse_method == "ocr":
        infer_result = ds.apply(doc_analyze, ocr=True)
        pipe_result = infer_result.pipe_ocr_mode(image_writer)
    elif parse_method == "txt":
        infer_result = ds.apply(doc_analyze, ocr=False)
        pipe_result = infer_result.pipe_txt_mode(image_writer)
    else:  # auto
        if ds.classify() == SupportedPdfParseMethod.OCR:
            infer_result = ds.apply(doc_analyze, ocr=True)
            pipe_result = infer_result.pipe_ocr_mode(image_writer)
        else:
            infer_result = ds.apply(doc_analyze, ocr=False)
            pipe_result = infer_result.pipe_txt_mode(image_writer)

    return infer_result, pipe_result


def encode_image(image_path: str) -> str:
    """Encode image using base64"""
    with open(image_path, "rb") as f:
        return b64encode(f.read()).decode()


@app.post(
    "/file_parse",
    tags=["projects"],
    summary="Parse files (supports local files and S3)",
)
async def file_parse(
    file: UploadFile = None,
    file_path: str = Form(None),
    parse_method: str = Form("auto"),
    is_json_md_dump: bool = Form(False),
    output_dir: str = Form("output"),
    return_layout: bool = Form(False),
    return_info: bool = Form(False),
    return_content_list: bool = Form(False),
    return_images: bool = Form(False),
):
    """
    Execute the process of converting PDF to JSON and MD, outputting MD and JSON files
    to the specified directory.

    Args:
        file: The PDF file to be parsed. Must not be specified together with
            `file_path`
        file_path: The path to the PDF file to be parsed. Must not be specified
            together with `file`
        parse_method: Parsing method, can be auto, ocr, or txt. Default is auto. If
            results are not satisfactory, try ocr
        is_json_md_dump: Whether to write parsed data to .json and .md files. Default
            to False. Different stages of data will be written to different .json files
            (3 in total), md content will be saved to .md file
        output_dir: Output directory for results. A folder named after the PDF file
            will be created to store all results
        return_layout: Whether to return parsed PDF layout. Default to False
        return_info: Whether to return parsed PDF info. Default to False
        return_content_list: Whether to return parsed PDF content list. Default to False
        return_images: Whether to return extracted images as base64 data URIs. Default
            to False
    """
    try:
        if (file is None and file_path is None) or (
            file is not None and file_path is not None
        ):
            return JSONResponse(
                content={"error": "Must provide exactly one of file or file_path"},
                status_code=400,
            )

        # Get the file name without its extension
        file_name = os.path.basename(file_path if file_path else file.filename).split(
            "."
        )[0]
        output_path = f"{output_dir}/{file_name}"
        output_image_path = f"{output_path}/images"

        # Initialize readers/writers and get the file content
        writer, image_writer, file_bytes, file_extension = init_writers(
            file_path=file_path,
            file=file,
            output_path=output_path,
            output_image_path=output_image_path,
        )

        # Process the file
        infer_result, pipe_result = process_file(file_bytes, file_extension, parse_method, image_writer)

        # Use MemoryDataWriter to get results
        content_list_writer = MemoryDataWriter()
        md_content_writer = MemoryDataWriter()
        middle_json_writer = MemoryDataWriter()

        # Use PipeResult's dump methods to get data
        pipe_result.dump_content_list(content_list_writer, "", "images")
        pipe_result.dump_md(md_content_writer, "", "images")
        pipe_result.dump_middle_json(middle_json_writer, "")

        # Get content
        content_list = json.loads(content_list_writer.get_value())
        md_content = md_content_writer.get_value()
        middle_json = json.loads(middle_json_writer.get_value())
        model_json = infer_result.get_infer_res()

        # If results need to be saved
        if is_json_md_dump:
            writer.write_string(
                f"{file_name}_content_list.json", content_list_writer.get_value()
            )
            writer.write_string(f"{file_name}.md", md_content)
            writer.write_string(
                f"{file_name}_middle.json", middle_json_writer.get_value()
            )
            writer.write_string(
                f"{file_name}_model.json",
                json.dumps(model_json, indent=4, ensure_ascii=False),
            )
            # Save visualization results
            pipe_result.draw_layout(os.path.join(output_path, f"{file_name}_layout.pdf"))
            pipe_result.draw_span(os.path.join(output_path, f"{file_name}_spans.pdf"))
            pipe_result.draw_line_sort(
                os.path.join(output_path, f"{file_name}_line_sort.pdf")
            )
            infer_result.draw_model(os.path.join(output_path, f"{file_name}_model.pdf"))

        # Build return data
        data = {}
        if return_layout:
            data["layout"] = model_json
        if return_info:
            data["info"] = middle_json
        if return_content_list:
            data["content_list"] = content_list
        if return_images:
            image_paths = glob(f"{output_image_path}/*.jpg")
            data["images"] = {
                os.path.basename(
                    image_path
                ): f"data:image/jpeg;base64,{encode_image(image_path)}"
                for image_path in image_paths
            }
        data["md_content"] = md_content  # md_content is always returned

        # Clean up memory writers
        content_list_writer.close()
        md_content_writer.close()
        middle_json_writer.close()

        return JSONResponse(data, status_code=200)

    except Exception as e:
        logger.exception(e)
        return JSONResponse(content={"error": str(e)}, status_code=500)


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8888)
@@ -1,33 +0,0 @@

#!/usr/bin/env python
from huggingface_hub import snapshot_download


if __name__ == "__main__":

    mineru_patterns = [
        # "models/Layout/LayoutLMv3/*",
        "models/Layout/YOLO/*",
        "models/MFD/YOLO/*",
        "models/MFR/unimernet_hf_small_2503/*",
        "models/OCR/paddleocr_torch/*",
        # "models/TabRec/TableMaster/*",
        # "models/TabRec/StructEqTable/*",
    ]
    model_dir = snapshot_download(
        "opendatalab/PDF-Extract-Kit-1.0",
        allow_patterns=mineru_patterns,
        local_dir="/opt/",
    )

    layoutreader_pattern = [
        "*.json",
        "*.safetensors",
    ]
    layoutreader_model_dir = snapshot_download(
        "hantian/layoutreader",
        allow_patterns=layoutreader_pattern,
        local_dir="/opt/layoutreader/",
    )

    model_dir = model_dir + "/models"
    print(f"model_dir is: {model_dir}")
    print(f"layoutreader_model_dir is: {layoutreader_model_dir}")
@@ -1,5 +0,0 @@

#!/usr/bin/env bash
set -euo pipefail

. /app/venv/bin/activate
exec uvicorn app:app "$@"
@@ -1,44 +0,0 @@

{
    "bucket_info":{
        "bucket-name-1":["ak", "sk", "endpoint"],
        "bucket-name-2":["ak", "sk", "endpoint"]
    },
    "models-dir":"/opt/models",
    "layoutreader-model-dir":"/opt/layoutreader",
    "device-mode":"cuda",
    "layout-config": {
        "model": "doclayout_yolo"
    },
    "formula-config": {
        "mfd_model": "yolo_v8_mfd",
        "mfr_model": "unimernet_small",
        "enable": true
    },
    "table-config": {
        "model": "rapid_table",
        "sub_model": "slanet_plus",
        "enable": true,
        "max_time": 400
    },
    "llm-aided-config": {
        "formula_aided": {
            "api_key": "your_api_key",
            "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
            "model": "qwen2.5-7b-instruct",
            "enable": false
        },
        "text_aided": {
            "api_key": "your_api_key",
            "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
            "model": "qwen2.5-7b-instruct",
            "enable": false
        },
        "title_aided": {
            "api_key": "your_api_key",
            "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
            "model": "qwen2.5-32b-instruct",
            "enable": false
        }
    },
    "config_version": "1.2.0"
}
@@ -1,5 +0,0 @@

magic-pdf[full]

fastapi
uvicorn
python-multipart