mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
feat: mineru web and web_api
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -38,4 +38,5 @@ source.dev.env
|
|||||||
|
|
||||||
tmp
|
tmp
|
||||||
|
|
||||||
projects/web/node_modules
|
projects/web/node_modules
|
||||||
|
projects/web/dist
|
||||||
|
|||||||
@@ -8,5 +8,5 @@ npm install -g pnpm
|
|||||||
3. build
|
3. build
|
||||||
```
|
```
|
||||||
1.pnpm run build
|
1.pnpm run build
|
||||||
2.npm run buil
|
2.npm run build
|
||||||
```
|
```
|
||||||
@@ -22,9 +22,11 @@
|
|||||||
"ahooks": "^3.8.1",
|
"ahooks": "^3.8.1",
|
||||||
"antd": "^5.20.3",
|
"antd": "^5.20.3",
|
||||||
"axios": "^1.7.5",
|
"axios": "^1.7.5",
|
||||||
|
"canvas": "^2.11.2",
|
||||||
"classnames": "^2.5.1",
|
"classnames": "^2.5.1",
|
||||||
"js-cookie": "^3.0.5",
|
"js-cookie": "^3.0.5",
|
||||||
"lodash": "^4.17.21",
|
"lodash": "^4.17.21",
|
||||||
|
"path2d": "^0.2.1",
|
||||||
"qs": "^6.13.0",
|
"qs": "^6.13.0",
|
||||||
"react": "^18.3.1",
|
"react": "^18.3.1",
|
||||||
"react-copy-to-clipboard": "^5.1.0",
|
"react-copy-to-clipboard": "^5.1.0",
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
import ErrorBoundary from "@/components/error-boundary";
|
import ErrorBoundary from "@/components/error-boundary";
|
||||||
import styles from "./home.module.scss";
|
import styles from "./home.module.scss";
|
||||||
import { SlotID, Path } from "@/constant/route";
|
import { SlotID, Path } from "@/constant/route";
|
||||||
import { BrowserRouter, Routes, Route, Outlet } from "react-router-dom";
|
import { HashRouter, Routes, Route, Outlet } from "react-router-dom";
|
||||||
import { ExtractorSide } from "./extract-side";
|
import { ExtractorSide } from "./extract-side";
|
||||||
import { LanguageProvider } from "@/context/language-provider";
|
import { LanguageProvider } from "@/context/language-provider";
|
||||||
import PDFUpload from "@/pages/extract/components/pdf-upload";
|
import PDFUpload from "@/pages/extract/components/pdf-upload";
|
||||||
@@ -70,9 +70,9 @@ export function Home() {
|
|||||||
return (
|
return (
|
||||||
<ErrorBoundary>
|
<ErrorBoundary>
|
||||||
<LanguageProvider>
|
<LanguageProvider>
|
||||||
<BrowserRouter>
|
<HashRouter>
|
||||||
<Screen />
|
<Screen />
|
||||||
</BrowserRouter>
|
</HashRouter>
|
||||||
</LanguageProvider>
|
</LanguageProvider>
|
||||||
</ErrorBoundary>
|
</ErrorBoundary>
|
||||||
);
|
);
|
||||||
|
|||||||
@@ -1,34 +1,43 @@
|
|||||||
## 安装
|
## MinerU 本地API服务
|
||||||
|
|
||||||
MinerU
|
MinerU
|
||||||
|
|
||||||
```bash
|
```
|
||||||
# mineru已安装则跳过此步骤
|
# 服务依赖mineru,请先确保mineru已安装
|
||||||
|
|
||||||
git clone https://github.com/opendatalab/MinerU.git
|
|
||||||
cd MinerU
|
|
||||||
|
|
||||||
conda create -n MinerU python=3.10
|
|
||||||
conda activate MinerU
|
|
||||||
pip install .[full] --extra-index-url https://wheels.myhloli.com
|
|
||||||
```
|
```
|
||||||
|
|
||||||
第三方软件
|
1. 打包前端界面
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
# 先进入前端目录
|
||||||
|
cd projects/web
|
||||||
|
# 打包前端项目
|
||||||
|
npm install -g yarn
|
||||||
|
yarn install
|
||||||
|
yarn build
|
||||||
|
```
|
||||||
|
|
||||||
|
2. 安装服务依赖
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 先进入后端目录
|
||||||
cd projects/web_api
|
cd projects/web_api
|
||||||
pip install poetry
|
# 安装依赖
|
||||||
portey install
|
pip3 install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
|
||||||
```
|
```
|
||||||
|
|
||||||
启动服务
|
3. 启动服务
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cd web_api
|
# 进入程序目录
|
||||||
python app.py
|
cd projects/web_api/web_api
|
||||||
|
# 启动服务
|
||||||
|
python3 app.py
|
||||||
|
# 在浏览器访问启动的地址即可访问界面
|
||||||
```
|
```
|
||||||
|
|
||||||
接口文档
|
ps:接口文档
|
||||||
|
|
||||||
```
|
```
|
||||||
在浏览器打开 mineru-web接口文档.html
|
在浏览器打开 mineru-web接口文档.html
|
||||||
```
|
```
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
13
projects/web_api/requirements.txt
Normal file
13
projects/web_api/requirements.txt
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
flask-cors
|
||||||
|
flask-jwt-extended
|
||||||
|
flask-marshmallow
|
||||||
|
flask-migrate
|
||||||
|
flask-restful
|
||||||
|
flask-sqlalchemy
|
||||||
|
flask
|
||||||
|
greenlet
|
||||||
|
loguru
|
||||||
|
marshmallow-sqlalchemy
|
||||||
|
marshmallow
|
||||||
|
pyjwt
|
||||||
|
pyyaml
|
||||||
@@ -4,7 +4,7 @@ from common.web_hook import before_request
|
|||||||
from common.logger import setup_log
|
from common.logger import setup_log
|
||||||
|
|
||||||
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||||
print("root_dir", root_dir)
|
|
||||||
|
|
||||||
def _register_db(flask_app):
|
def _register_db(flask_app):
|
||||||
from common import import_models
|
from common import import_models
|
||||||
@@ -30,6 +30,8 @@ def create_app(config):
|
|||||||
ma.init_app(app=app)
|
ma.init_app(app=app)
|
||||||
from .analysis import analysis_blue
|
from .analysis import analysis_blue
|
||||||
app.register_blueprint(analysis_blue)
|
app.register_blueprint(analysis_blue)
|
||||||
|
from .react_app import react_app_blue
|
||||||
|
app.register_blueprint(react_app_blue)
|
||||||
|
|
||||||
app.before_request(before_request)
|
app.before_request(before_request)
|
||||||
|
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ from .upload_view import UploadPdfView
|
|||||||
from .analysis_view import AnalysisTaskView, AnalysisTaskProgressView
|
from .analysis_view import AnalysisTaskView, AnalysisTaskProgressView
|
||||||
from .img_md_view import ImgView, MdView
|
from .img_md_view import ImgView, MdView
|
||||||
from .task_view import TaskView, HistoricalTasksView, DeleteTaskView
|
from .task_view import TaskView, HistoricalTasksView, DeleteTaskView
|
||||||
|
from .markdown_view import MarkdownView
|
||||||
|
|
||||||
analysis_blue = Blueprint('analysis', __name__)
|
analysis_blue = Blueprint('analysis', __name__)
|
||||||
|
|
||||||
@@ -15,4 +16,5 @@ api_v2.add_resource(ImgView, '/analysis/pdf_img')
|
|||||||
api_v2.add_resource(MdView, '/analysis/pdf_md')
|
api_v2.add_resource(MdView, '/analysis/pdf_md')
|
||||||
api_v2.add_resource(TaskView, '/extract/taskQueue')
|
api_v2.add_resource(TaskView, '/extract/taskQueue')
|
||||||
api_v2.add_resource(HistoricalTasksView, '/extract/list')
|
api_v2.add_resource(HistoricalTasksView, '/extract/list')
|
||||||
api_v2.add_resource(DeleteTaskView, '/extract/task/<int:id>')
|
api_v2.add_resource(DeleteTaskView, '/extract/task/<int:id>')
|
||||||
|
api_v2.add_resource(MarkdownView, '/extract/markdown')
|
||||||
@@ -1,5 +1,6 @@
|
|||||||
import json
|
import json
|
||||||
import threading
|
import threading
|
||||||
|
from multiprocessing import Process
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from flask import request, current_app, url_for
|
from flask import request, current_app, url_for
|
||||||
from flask_restful import Resource
|
from flask_restful import Resource
|
||||||
@@ -212,10 +213,10 @@ class AnalysisTaskView(Resource):
|
|||||||
pdf_analysis_folder = current_app.config['PDF_ANALYSIS_FOLDER']
|
pdf_analysis_folder = current_app.config['PDF_ANALYSIS_FOLDER']
|
||||||
pdf_dir = f"{current_app.static_folder}/{pdf_analysis_folder}/{file_stem}"
|
pdf_dir = f"{current_app.static_folder}/{pdf_analysis_folder}/{file_stem}"
|
||||||
image_dir = f"{pdf_dir}/images"
|
image_dir = f"{pdf_dir}/images"
|
||||||
t = threading.Thread(target=analysis_pdf_task,
|
process = Process(target=analysis_pdf_task,
|
||||||
args=(pdf_dir, image_dir, file_path, analysis_task.is_ocr,
|
args=(pdf_dir, image_dir, file_path, analysis_task.is_ocr,
|
||||||
analysis_task.analysis_pdf_id))
|
analysis_task.analysis_pdf_id))
|
||||||
t.start()
|
process.start()
|
||||||
|
|
||||||
# 生成文件的URL路径
|
# 生成文件的URL路径
|
||||||
file_url = url_for('analysis.uploadpdfview', filename=analysis_task.file_name, as_attachment=False)
|
file_url = url_for('analysis.uploadpdfview', filename=analysis_task.file_name, as_attachment=False)
|
||||||
|
|||||||
44
projects/web_api/web_api/api/analysis/markdown_view.py
Normal file
44
projects/web_api/web_api/api/analysis/markdown_view.py
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
from flask import request, current_app
|
||||||
|
from flask_restful import Resource
|
||||||
|
from common.custom_response import generate_response
|
||||||
|
|
||||||
|
|
||||||
|
class MarkdownView(Resource):
    """REST resource for editing the markdown files of one PDF analysis result."""

    def put(self):
        """
        Overwrite existing markdown files of an analysis result and rebuild ``full.md``.

        The JSON request body is expected to carry:

        * ``file_key`` -- non-empty string; the first directory under
          ``<static_folder>/<PDF_ANALYSIS_FOLDER>`` whose name starts with it
          is the one that gets edited.
        * ``data`` -- non-empty mapping of markdown file stem to its new text.

        Only ``<stem>.md`` files that already exist directly inside the matched
        directory are rewritten; any other stem is ignored.  Afterwards every
        ``*.md`` file in that directory except ``full.md`` is concatenated
        (each followed by a newline) into ``full.md``.

        Returns:
            ``generate_response()`` on success, or a ``code=400`` response when
            the body is not a JSON object, ``data`` is empty or malformed, or no
            analysis directory matches ``file_key``.
        """
        try:
            params = json.loads(request.data)
        except ValueError:
            # Covers both JSONDecodeError and UnicodeDecodeError.
            return generate_response(code=400, msg="invalid json", msgZH="请求体不是合法的JSON")
        if not isinstance(params, dict):
            return generate_response(code=400, msg="invalid json", msgZH="请求体不是合法的JSON")

        file_key = params.get('file_key')
        data = params.get('data', {})
        if not data:
            return generate_response(code=400, msg="empty data", msgZH="数据为空,无法更新markdown")
        # Validate everything up front: opening a file with 'w' truncates it, so a
        # bad value discovered mid-write would otherwise destroy existing content.
        if not isinstance(data, dict) or not all(isinstance(text, str) for text in data.values()):
            return generate_response(code=400, msg="invalid data", msgZH="数据格式错误,无法更新markdown")
        # An empty prefix would match every directory; a non-string would make
        # ``startswith`` raise.
        if not isinstance(file_key, str) or not file_key:
            return generate_response(code=400, msg="Invalid file_key", msgZH="文件哈希错误")

        pdf_analysis_folder = current_app.config['PDF_ANALYSIS_FOLDER']
        analysis_root = Path(f"{current_app.static_folder}/{pdf_analysis_folder}")
        markdown_file_dir = None
        if analysis_root.is_dir():
            markdown_file_dir = next(
                (
                    entry
                    for entry in analysis_root.iterdir()
                    if entry.is_dir() and entry.name.startswith(file_key)
                ),
                None,
            )
        if markdown_file_dir is None:
            return generate_response(code=400, msg="Invalid file_key", msgZH="文件哈希错误")

        for stem, text in data.items():
            file_name = f"{stem}.md"
            # The stem comes from the client: skip anything that is not a plain
            # file name so that "../" or absolute paths cannot reach files
            # outside the analysis directory.
            if Path(file_name).name != file_name:
                continue
            md_path = markdown_file_dir / file_name
            if md_path.is_file():
                md_path.write_text(text, encoding="utf-8")

        # Rebuild the combined document from the individual markdown files.
        # NOTE(review): iteration is sorted by path so the result no longer
        # depends on filesystem order; if the per-part files are named with
        # un-padded numbers a numeric sort key may be needed -- confirm naming.
        parts = [
            md_file.read_text(encoding="utf-8") + "\n"
            for md_file in sorted(markdown_file_dir.iterdir())
            if md_file.is_file() and md_file.suffix == ".md" and md_file.stem != "full"
        ]
        (markdown_file_dir / "full.md").write_text("".join(parts), encoding="utf-8")
        return generate_response()
|
||||||
@@ -59,3 +59,4 @@ db = SQLAlchemy()
|
|||||||
migrate = Migrate()
|
migrate = Migrate()
|
||||||
jwt = JWTManager()
|
jwt = JWTManager()
|
||||||
ma = Marshmallow()
|
ma = Marshmallow()
|
||||||
|
folder = app.config.get("REACT_APP_DIST")
|
||||||
|
|||||||
11
projects/web_api/web_api/api/react_app/__init__.py
Normal file
11
projects/web_api/web_api/api/react_app/__init__.py
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
from flask import Blueprint
|
||||||
|
from ..extentions import app, Api
|
||||||
|
from .react_app_view import ReactAppView
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
# Directory that holds the built front-end bundle.  A relative setting is
# resolved against the current working directory of the server process.
folder = Path(app.config.get("REACT_APP_DIST", "../../web/dist/")).resolve()
logger.info(f"react_app folder: {folder}")

# The same directory is used for the static assets (mounted at the URL root)
# and as the template folder from which ``index.html`` is rendered.
react_app_blue = Blueprint(
    'react_app',
    __name__,
    static_folder=folder,
    static_url_path='',
    template_folder=folder,
)

# Route "/" to the view that returns the single-page application shell.
react_app_api = Api(react_app_blue, prefix='')
react_app_api.add_resource(ReactAppView, '/')
|
||||||
11
projects/web_api/web_api/api/react_app/react_app_view.py
Normal file
11
projects/web_api/web_api/api/react_app/react_app_view.py
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
from flask import render_template, Response
|
||||||
|
from flask_restful import Resource
|
||||||
|
|
||||||
|
|
||||||
|
class ReactAppView(Resource):
    """Resource that returns the front-end entry page."""

    def get(self):
        """Render ``index.html`` and return it as a ``text/html`` response."""
        # Build the response object explicitly so the HTML mimetype is set here.
        return Response(render_template('index.html'), mimetype='text/html')
|
||||||
@@ -11,6 +11,8 @@ BaseConfig: &base
|
|||||||
JWT_ACCESS_TOKEN_EXPIRES: 3600
|
JWT_ACCESS_TOKEN_EXPIRES: 3600
|
||||||
PDF_UPLOAD_FOLDER: "upload_pdf"
|
PDF_UPLOAD_FOLDER: "upload_pdf"
|
||||||
PDF_ANALYSIS_FOLDER: "analysis_pdf"
|
PDF_ANALYSIS_FOLDER: "analysis_pdf"
|
||||||
|
# 前端项目打包的路径
|
||||||
|
REACT_APP_DIST: "../../web/dist/"
|
||||||
|
|
||||||
# 开发配置
|
# 开发配置
|
||||||
DevelopmentConfig:
|
DevelopmentConfig:
|
||||||
|
|||||||
Binary file not shown.
Reference in New Issue
Block a user