mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
feat: add support for DOCX file format in converter
This commit is contained in:
1
mineru/backend/office/__init__.py
Normal file
1
mineru/backend/office/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Copyright (c) Opendatalab. All rights reserved.
|
||||
1
mineru/backend/office/docx.py
Normal file
1
mineru/backend/office/docx.py
Normal file
@@ -0,0 +1 @@
|
||||
# Copyright (c) Opendatalab. All rights reserved.
|
||||
@@ -28,6 +28,7 @@ if os.getenv("MINERU_LMDEPLOY_DEVICE", "") == "maca":
|
||||
|
||||
pdf_suffixes = ["pdf"]
|
||||
image_suffixes = ["png", "jpeg", "jp2", "webp", "gif", "bmp", "jpg", "tiff"]
|
||||
docx_suffixes = ["docx"]
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
|
||||
@@ -39,7 +40,7 @@ def read_fn(path):
|
||||
file_suffix = guess_suffix_by_bytes(file_bytes, path)
|
||||
if file_suffix in image_suffixes:
|
||||
return images_bytes_to_pdf_bytes(file_bytes)
|
||||
elif file_suffix in pdf_suffixes:
|
||||
elif file_suffix in pdf_suffixes + docx_suffixes:
|
||||
return file_bytes
|
||||
else:
|
||||
raise Exception(f"Unknown file suffix: {file_suffix}")
|
||||
|
||||
@@ -5,7 +5,6 @@ from pathlib import Path
|
||||
from typing import BinaryIO, Optional, Union, Any, Final
|
||||
|
||||
import logging
|
||||
import filetype
|
||||
from PIL import Image, UnidentifiedImageError
|
||||
from loguru import logger
|
||||
from docx import Document
|
||||
@@ -14,8 +13,6 @@ from docx.text.paragraph import Paragraph
|
||||
from docx.text.hyperlink import Hyperlink
|
||||
from docx.text.run import Run
|
||||
from lxml import etree
|
||||
from markdown_it.common.html_re import attribute
|
||||
from markdown_it.rules_block import list_block
|
||||
from pydantic import AnyUrl
|
||||
|
||||
from mineru.model.utils.docx.mammoth import body_xml
|
||||
@@ -87,23 +84,6 @@ class DocxConverter:
|
||||
self.equation_bookends: str = "<eq>{EQ}</eq>" # 公式标记格式
|
||||
Path.mkdir(self.output_path, parents=True, exist_ok=True)
|
||||
|
||||
def accepts(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
) -> bool:
|
||||
mimetype = filetype.guess_mime(file_stream).lower()
|
||||
if mimetype is None:
|
||||
logger.error(f"Failed to detect mimetype for {self.file_path}")
|
||||
extension = os.path.splitext(self.file_path)[1].lower()
|
||||
|
||||
if extension in ACCEPTED_FILE_EXTENSIONS:
|
||||
return True
|
||||
|
||||
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
|
||||
if mimetype.startswith(prefix):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def convert(
|
||||
self,
|
||||
|
||||
@@ -9,8 +9,6 @@ def convert_path(file_path: str):
|
||||
|
||||
def convert_binary(file_binary: BinaryIO, file_path: str):
|
||||
converter = DocxConverter(file_path=file_path, output_path="./output")
|
||||
if not converter.accepts(file_binary):
|
||||
raise Exception("Not a docx file")
|
||||
return converter.convert(file_binary)
|
||||
|
||||
|
||||
|
||||
@@ -41,10 +41,8 @@ dependencies = [
|
||||
"magika>=0.6.2,<1.1.0",
|
||||
"mineru-vl-utils>=0.1.19.1,<1",
|
||||
"qwen-vl-utils>=0.0.14,<1",
|
||||
"mammoth>=1.11.0,<2",
|
||||
"defusedxml",
|
||||
"filetype",
|
||||
"markdownify",
|
||||
"defusedxml>=0.7.1,<1",
|
||||
"python-docx>=1.2.0,<2",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
|
||||
Reference in New Issue
Block a user