# Mirror of https://github.com/opendatalab/MinerU.git
# Synced 2026-03-28 11:38:32 +07:00
# 90 lines, 2.8 KiB, Python
import json
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
|
|
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
|
|
|
|
|
|
# Status values stored in ``VisualizationResult.status``.
# "finished": the visualization PDF(s) were generated.
VISUALIZATION_FINISHED = "finished"
# "skipped": inputs were missing/invalid or drawing raised an exception.
VISUALIZATION_SKIPPED = "skipped"
|
|
|
|
|
|
@dataclass(frozen=True)
class VisualizationJob:
    """Immutable description of one visualization task for a parsed document."""

    # Base name used to derive the "<stem>_middle.json", "<stem>_origin.pdf"
    # and output PDF file names.
    document_stem: str
    # Name of the parsing backend; carried along, not read by
    # run_visualization_job itself.
    backend: str
    # Parse method label; carried along, not read by run_visualization_job.
    parse_method: str
    # Directory holding the parse artifacts for this document.
    parse_dir: Path
    # When true, a span-level PDF is drawn in addition to the layout PDF.
    draw_span: bool
|
|
|
|
|
|
@dataclass(frozen=True)
class VisualizationResult:
    """Immutable outcome of a single visualization job."""

    # Base name of the document the job was run for.
    document_stem: str
    # Directory the job read its inputs from.
    parse_dir: Path
    # Outcome marker: "finished" or "skipped".
    status: str
    # Human-readable explanation of the outcome.
    message: str
    # File names of the generated PDFs; empty unless the job finished.
    generated_files: tuple[str, ...] = ()
|
|
|
|
|
|
def _skipped_result(job: VisualizationJob, message: str) -> VisualizationResult:
    """Build the ``skipped`` result for *job* carrying *message*."""
    return VisualizationResult(
        document_stem=job.document_stem,
        parse_dir=job.parse_dir,
        status=VISUALIZATION_SKIPPED,
        message=message,
    )


def run_visualization_job(job: VisualizationJob) -> VisualizationResult:
    """Draw the layout (and optionally span) bounding-box PDF(s) for one job.

    Reads ``<stem>_middle.json`` and ``<stem>_origin.pdf`` from
    ``job.parse_dir`` and hands the ``pdf_info`` list plus the PDF bytes to
    ``draw_layout_bbox`` and, when ``job.draw_span`` is set, ``draw_span_bbox``.

    This function is best-effort: it does not raise for missing inputs,
    unreadable or malformed JSON, or drawing failures. Each of those cases is
    reported as a result with status ``VISUALIZATION_SKIPPED`` and an
    explanatory message.

    Args:
        job: The visualization job to run.

    Returns:
        A ``VisualizationResult`` with status ``VISUALIZATION_FINISHED`` and
        the generated file names on success, otherwise a
        ``VISUALIZATION_SKIPPED`` result with no generated files.
    """
    middle_json_path = job.parse_dir / f"{job.document_stem}_middle.json"
    origin_pdf_path = job.parse_dir / f"{job.document_stem}_origin.pdf"

    if not middle_json_path.exists():
        return _skipped_result(job, f"missing middle.json: {middle_json_path.name}")
    if not origin_pdf_path.exists():
        return _skipped_result(job, f"missing origin.pdf: {origin_pdf_path.name}")

    # Deliberately broad: any read/decode/parse problem downgrades to "skipped".
    try:
        payload = json.loads(middle_json_path.read_text(encoding="utf-8"))
    except Exception as exc:
        return _skipped_result(job, f"failed to read middle.json: {exc}")

    # Valid JSON is not necessarily an object: a top-level list, string,
    # number or null has no ``.get`` and would otherwise escape as an
    # AttributeError instead of being reported as a skipped job.
    pdf_info = payload.get("pdf_info") if isinstance(payload, dict) else None
    if not isinstance(pdf_info, list):
        return _skipped_result(job, "invalid middle.json: missing pdf_info")

    layout_name = f"{job.document_stem}_layout.pdf"
    generated_files = [layout_name]
    try:
        pdf_bytes = origin_pdf_path.read_bytes()
        # NOTE(review): the draw helpers receive the output directory and a
        # file name; presumably they write the PDF there — confirm against
        # mineru.utils.draw_bbox.
        draw_layout_bbox(pdf_info, pdf_bytes, str(job.parse_dir), layout_name)
        if job.draw_span:
            span_name = f"{job.document_stem}_span.pdf"
            generated_files.append(span_name)
            draw_span_bbox(pdf_info, pdf_bytes, str(job.parse_dir), span_name)
    except Exception as exc:
        # Best-effort boundary: a drawing failure must not abort the caller.
        return _skipped_result(job, f"visualization failed: {exc}")

    return VisualizationResult(
        document_stem=job.document_stem,
        parse_dir=job.parse_dir,
        status=VISUALIZATION_FINISHED,
        message="generated visualization pdf(s)",
        generated_files=tuple(generated_files),
    )
|