feat: add support for DOCX file format in converter

This commit is contained in:
myhloli
2026-01-05 17:44:17 +08:00
parent 66f8f0e93a
commit f44fb174ea
6 changed files with 6 additions and 27 deletions

View File

@@ -0,0 +1 @@
# Copyright (c) Opendatalab. All rights reserved.

View File

@@ -0,0 +1 @@
# Copyright (c) Opendatalab. All rights reserved.

View File

@@ -28,6 +28,7 @@ if os.getenv("MINERU_LMDEPLOY_DEVICE", "") == "maca":
# File-suffix groups (no leading dot) that the guessed suffix of an input file
# is compared against when deciding how to read it.
pdf_suffixes = ["pdf"]
image_suffixes = ["png", "jpeg", "jp2", "webp", "gif", "bmp", "jpg", "tiff"]
docx_suffixes = ["docx"]
# Sets TOKENIZERS_PARALLELISM to "false" in this process's environment.
# NOTE(review): presumably to disable tokenizer-library parallelism/warnings — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -39,7 +40,7 @@ def read_fn(path):
file_suffix = guess_suffix_by_bytes(file_bytes, path)
if file_suffix in image_suffixes:
return images_bytes_to_pdf_bytes(file_bytes)
elif file_suffix in pdf_suffixes:
elif file_suffix in pdf_suffixes + docx_suffixes:
return file_bytes
else:
raise Exception(f"Unknown file suffix: {file_suffix}")

View File

@@ -5,7 +5,6 @@ from pathlib import Path
from typing import BinaryIO, Optional, Union, Any, Final
import logging
import filetype
from PIL import Image, UnidentifiedImageError
from loguru import logger
from docx import Document
@@ -14,8 +13,6 @@ from docx.text.paragraph import Paragraph
from docx.text.hyperlink import Hyperlink
from docx.text.run import Run
from lxml import etree
from markdown_it.common.html_re import attribute
from markdown_it.rules_block import list_block
from pydantic import AnyUrl
from mineru.model.utils.docx.mammoth import body_xml
@@ -87,23 +84,6 @@ class DocxConverter:
self.equation_bookends: str = "<eq>{EQ}</eq>" # 公式标记格式
Path.mkdir(self.output_path, parents=True, exist_ok=True)
def accepts(
    self,
    file_stream: BinaryIO,
) -> bool:
    """Return True when the input looks like a file this converter handles.

    The file is accepted when the extension of ``self.file_path`` is listed
    in ``ACCEPTED_FILE_EXTENSIONS``, or when the MIME type detected from
    ``file_stream`` starts with one of ``ACCEPTED_MIME_TYPE_PREFIXES``.

    Args:
        file_stream: Binary stream whose content is sniffed for a MIME type.

    Returns:
        True if the extension or the detected MIME type matches, else False.
    """
    # guess_mime() yields None when the content type cannot be determined,
    # so the None check must come before any string method is called on it.
    mimetype = filetype.guess_mime(file_stream)
    if mimetype is None:
        logger.error(f"Failed to detect mimetype for {self.file_path}")
    else:
        mimetype = mimetype.lower()

    # The extension check does not depend on MIME detection succeeding.
    extension = os.path.splitext(self.file_path)[1].lower()
    if extension in ACCEPTED_FILE_EXTENSIONS:
        return True

    # Without a detected MIME type there is nothing left to match against.
    if mimetype is None:
        return False

    return any(
        mimetype.startswith(prefix) for prefix in ACCEPTED_MIME_TYPE_PREFIXES
    )
def convert(
self,

View File

@@ -9,8 +9,6 @@ def convert_path(file_path: str):
def convert_binary(file_binary: BinaryIO, file_path: str):
    """Convert an in-memory DOCX stream using a ``DocxConverter``.

    A converter is created for ``file_path`` with ``"./output"`` as its
    output path. The stream is converted only if the converter accepts it.

    Args:
        file_binary: Binary stream holding the document content.
        file_path: Path associated with the document.

    Returns:
        Whatever ``DocxConverter.convert`` returns for the stream.

    Raises:
        Exception: If the converter does not accept the stream.
    """
    docx_converter = DocxConverter(file_path=file_path, output_path="./output")
    if docx_converter.accepts(file_binary):
        return docx_converter.convert(file_binary)
    raise Exception("Not a docx file")

View File

@@ -41,10 +41,8 @@ dependencies = [
"magika>=0.6.2,<1.1.0",
"mineru-vl-utils>=0.1.19.1,<1",
"qwen-vl-utils>=0.0.14,<1",
"mammoth>=1.11.0,<2",
"defusedxml",
"filetype",
"markdownify",
"defusedxml>=0.7.1,<1",
"python-docx>=1.2.0,<2",
]
[project.optional-dependencies]