Files
MinerU/mineru/cli/visualization.py

90 lines
2.8 KiB
Python

import json
from dataclasses import dataclass
from pathlib import Path
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
# Status set on the result of run_visualization_job when the visualization
# PDF(s) were drawn without error.
VISUALIZATION_FINISHED = "finished"
# Status set on the result of run_visualization_job for every early exit:
# missing inputs, unreadable or invalid middle.json, and drawing failures.
VISUALIZATION_SKIPPED = "skipped"
@dataclass(frozen=True)
class VisualizationJob:
    """Immutable description of one document whose parse output is visualized.

    ``run_visualization_job`` looks inside ``parse_dir`` for
    ``<document_stem>_middle.json`` and ``<document_stem>_origin.pdf``.
    """

    # File-name prefix shared by the input files and the generated PDFs.
    document_stem: str
    # Carried with the job; not consulted by run_visualization_job.
    backend: str
    # Carried with the job; not consulted by run_visualization_job.
    parse_method: str
    # Directory that holds the inputs and receives the generated PDFs.
    parse_dir: Path
    # When true, a span PDF is drawn in addition to the layout PDF.
    draw_span: bool
@dataclass(frozen=True)
class VisualizationResult:
    """Immutable outcome of a single visualization job."""

    # Copied from the job so the result can be matched back to its document.
    document_stem: str
    # Copied from the job; generated file names are relative to this directory.
    parse_dir: Path
    # One of the module-level VISUALIZATION_* status strings.
    status: str
    # Human-readable explanation of the status.
    message: str
    # File names of the PDFs that were written; empty unless the job finished.
    generated_files: tuple[str, ...] = ()
def run_visualization_job(job: VisualizationJob) -> VisualizationResult:
    """Draw the layout (and optionally span) bounding-box PDFs for one document.

    Reads ``<document_stem>_middle.json`` and ``<document_stem>_origin.pdf``
    from ``job.parse_dir`` and writes ``<document_stem>_layout.pdf`` there, plus
    ``<document_stem>_span.pdf`` when ``job.draw_span`` is true.

    Args:
        job: The document to visualize.

    Returns:
        A result with status ``VISUALIZATION_FINISHED`` and the generated file
        names on success. Missing inputs, an unreadable or malformed
        ``middle.json`` and errors raised while drawing are reported as a
        ``VISUALIZATION_SKIPPED`` result carrying an explanatory message.
    """
    middle_json_path = job.parse_dir / f"{job.document_stem}_middle.json"
    origin_pdf_path = job.parse_dir / f"{job.document_stem}_origin.pdf"

    if not middle_json_path.exists():
        return _skipped_result(job, f"missing middle.json: {middle_json_path.name}")
    if not origin_pdf_path.exists():
        return _skipped_result(job, f"missing origin.pdf: {origin_pdf_path.name}")

    try:
        payload = json.loads(middle_json_path.read_text(encoding="utf-8"))
    except Exception as exc:
        return _skipped_result(job, f"failed to read middle.json: {exc}")

    # json.loads may return any JSON type (list, str, None, ...); only a dict
    # has .get, so guard before the lookup instead of raising AttributeError.
    pdf_info = payload.get("pdf_info") if isinstance(payload, dict) else None
    if not isinstance(pdf_info, list):
        return _skipped_result(job, "invalid middle.json: missing pdf_info")

    output_dir = str(job.parse_dir)
    try:
        pdf_bytes = origin_pdf_path.read_bytes()
        layout_name = f"{job.document_stem}_layout.pdf"
        generated_files = [layout_name]
        draw_layout_bbox(pdf_info, pdf_bytes, output_dir, layout_name)
        if job.draw_span:
            span_name = f"{job.document_stem}_span.pdf"
            generated_files.append(span_name)
            draw_span_bbox(pdf_info, pdf_bytes, output_dir, span_name)
    except Exception as exc:
        return _skipped_result(job, f"visualization failed: {exc}")

    return VisualizationResult(
        document_stem=job.document_stem,
        parse_dir=job.parse_dir,
        status=VISUALIZATION_FINISHED,
        message="generated visualization pdf(s)",
        generated_files=tuple(generated_files),
    )


def _skipped_result(job: VisualizationJob, message: str) -> VisualizationResult:
    """Build the VISUALIZATION_SKIPPED result for *job* carrying *message*."""
    return VisualizationResult(
        document_stem=job.document_stem,
        parse_dir=job.parse_dir,
        status=VISUALIZATION_SKIPPED,
        message=message,
    )