mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-04-12 07:06:44 +07:00
Compare commits
8 Commits
magic_pdf-
...
release-1.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
af53a46311 | ||
|
|
2e5e55cfe2 | ||
|
|
658e6bc768 | ||
|
|
4641264e12 | ||
|
|
4bd3381c92 | ||
|
|
f5a56bf157 | ||
|
|
78d11172e3 | ||
|
|
a2b07bfde4 |
@@ -36,7 +36,7 @@ RUN /bin/bash -c "wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/m
|
||||
source /opt/mineru_venv/bin/activate && \
|
||||
pip3 install --upgrade pip -i https://mirrors.aliyun.com/pypi/simple && \
|
||||
pip3 install torch==2.3.1 torchvision==0.18.1 -i https://mirrors.aliyun.com/pypi/simple && \
|
||||
pip3 install -U magic-pdf[full] -i https://mirrors.aliyun.com/pypi/simple && \
|
||||
pip3 install -U magic-pdf[full] 'numpy<2' decorator attrs absl-py cloudpickle ml-dtypes tornado einops -i https://mirrors.aliyun.com/pypi/simple && \
|
||||
wget https://gitee.com/ascend/pytorch/releases/download/v6.0.rc2-pytorch2.3.1/torch_npu-2.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl && \
|
||||
pip3 install torch_npu-2.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl"
|
||||
|
||||
|
||||
@@ -1 +1 @@
|
||||
__version__ = "1.3.3"
|
||||
__version__ = "1.3.4"
|
||||
|
||||
@@ -4,6 +4,8 @@ import platform
|
||||
from pathlib import Path
|
||||
import shutil
|
||||
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class ConvertToPdfError(Exception):
|
||||
def __init__(self, msg):
|
||||
@@ -11,35 +13,24 @@ class ConvertToPdfError(Exception):
|
||||
super().__init__(self.msg)
|
||||
|
||||
|
||||
# Chinese font list
|
||||
REQUIRED_CHS_FONTS = ['SimSun', 'Microsoft YaHei', 'Noto Sans CJK SC']
|
||||
|
||||
|
||||
def check_fonts_installed():
|
||||
"""Check if required Chinese fonts are installed."""
|
||||
system_type = platform.system()
|
||||
|
||||
if system_type == 'Windows':
|
||||
# Windows: check fonts via registry or system font folder
|
||||
font_dir = Path("C:/Windows/Fonts")
|
||||
installed_fonts = [f.name for f in font_dir.glob("*.ttf")]
|
||||
if any(font for font in REQUIRED_CHS_FONTS if any(font in f for f in installed_fonts)):
|
||||
return True
|
||||
raise EnvironmentError(
|
||||
f"Missing Chinese font. Please install at least one of: {', '.join(REQUIRED_CHS_FONTS)}"
|
||||
)
|
||||
if system_type in ['Windows', 'Darwin']:
|
||||
pass
|
||||
else:
|
||||
# Linux/macOS: use fc-list
|
||||
# Linux: use fc-list
|
||||
try:
|
||||
output = subprocess.check_output(['fc-list', ':lang=zh'], encoding='utf-8')
|
||||
for font in REQUIRED_CHS_FONTS:
|
||||
if font in output:
|
||||
return True
|
||||
raise EnvironmentError(
|
||||
f"Missing Chinese font. Please install at least one of: {', '.join(REQUIRED_CHS_FONTS)}"
|
||||
)
|
||||
except Exception as e:
|
||||
raise EnvironmentError(f"Font detection failed. Please install 'fontconfig' and fonts: {str(e)}")
|
||||
if output.strip(): # 只要有任何输出(非空)
|
||||
return True
|
||||
else:
|
||||
logger.warning(
|
||||
f"No Chinese fonts were detected, the converted document may not display Chinese content properly."
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def get_soffice_command():
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import unittest
|
||||
import os
|
||||
from PIL import Image
|
||||
from lxml import etree
|
||||
|
||||
@@ -8,7 +9,7 @@ from magic_pdf.model.sub_modules.table.rapidtable.rapid_table import RapidTableM
|
||||
|
||||
class TestppTableModel(unittest.TestCase):
|
||||
def test_image2html(self):
|
||||
img = Image.open("assets/table.jpg")
|
||||
img = Image.open(os.path.join(os.path.dirname(__file__), "assets/table.jpg"))
|
||||
atom_model_manager = AtomModelSingleton()
|
||||
ocr_engine = atom_model_manager.get_atom_model(
|
||||
atom_model_name='ocr',
|
||||
|
||||
Reference in New Issue
Block a user