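feat: enhance DOCX processing by refining image handling (gray placeholder for WMF images, RGB conversion for other images) and improving logging for inference timing (avoid division by zero when computing speed)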

2026-03-27 11:08:32 +07:00 · 2026-01-06 20:04:06 +08:00
parent 0cbe965d97
commit ad175df3d2
3 changed files with 13 additions and 5 deletions
--- a/demo/demo.py
+++ b/demo/demo.py
@@ -218,7 +218,7 @@ def _process_output(
        make_func = pipeline_union_make
    elif process_mode == "vlm":
        make_func = vlm_union_make
-    elif process_mode == "office":
+    elif process_mode in office_suffixes:
        make_func = office_union_make
    else:
        raise Exception(f"Unknown process_mode: {process_mode}")
--- a/mineru/backend/office/docx_analyze.py
+++ b/mineru/backend/office/docx_analyze.py
@@ -17,12 +17,13 @@ def office_docx_analyze(
    results = convert_binary(file_stream)

    infer_time = round(time.time() - infer_start, 2)
-    logger.debug(f"infer finished, cost: {infer_time}, speed: {round(len(results) / infer_time, 3)} page/s")
+    safe_time = max(infer_time, 0.01)
+    logger.debug(f"infer finished, cost: {infer_time}, speed: {round(len(results) / safe_time, 3)} page/s")

    # middle_json = result_to_middle_json(
    #     results,
    #     image_writer,
    # )
-    middle_json= []
+    middle_json= {"pdf_info": results}

    return middle_json, results
--- a/mineru/model/docx/docx_converter.py
+++ b/mineru/model/docx/docx_converter.py
@@ -5,7 +5,7 @@ from pathlib import Path
 from typing import BinaryIO, Optional, Union, Any, Final

 import logging
-from PIL import Image
+from PIL import Image, WmfImagePlugin
 from loguru import logger
 from docx import Document
 from docx.oxml.xmlchemy import BaseOxmlElement
@@ -466,7 +466,14 @@ class DocxConverter:
        else:
            image_bytes = BytesIO(image_data)
            pil_image = Image.open(image_bytes)
-            img_base64 = image_to_b64str(pil_image)
+            if isinstance(pil_image, WmfImagePlugin.WmfStubImageFile):
+                logger.warning(f"Skipping WMF image, size: {pil_image.size}")
+                placeholder = Image.new('RGB', pil_image.size, (240, 240, 240))
+                img_base64 = image_to_b64str(placeholder)
+            else:
+                if pil_image.mode != "RGB":
+                    pil_image = pil_image.convert("RGB")
+                img_base64 = image_to_b64str(pil_image)
            image_block = {
                "type": BlockType.IMAGE,
                "bbox": [0, 0, 0, 0],