mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 19:18:34 +07:00
Compare commits
37 Commits
release-1.
...
release-1.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0222293f64 | ||
|
|
16f176ea65 | ||
|
|
1705958f65 | ||
|
|
2de5a79f52 | ||
|
|
058d318491 | ||
|
|
cfa90743b5 | ||
|
|
b36b469a1c | ||
|
|
40bfd7acce | ||
|
|
b7ff7ded64 | ||
|
|
07edefaa7d | ||
|
|
24b7e7ca36 | ||
|
|
87440ba43c | ||
|
|
ff35c75531 | ||
|
|
5ddd6799aa | ||
|
|
039f8cbfde | ||
|
|
73ccfbbfbe | ||
|
|
410d0afc81 | ||
|
|
c774a4dde1 | ||
|
|
29b47466ff | ||
|
|
a1df670e34 | ||
|
|
a67de492b1 | ||
|
|
222af4f2f5 | ||
|
|
b9eed5d865 | ||
|
|
82a4376d8a | ||
|
|
99ab04f588 | ||
|
|
67b31a78d0 | ||
|
|
4f129a64aa | ||
|
|
47d287a2a0 | ||
|
|
bc51f9f75e | ||
|
|
8caf59f7cb | ||
|
|
4df8523a31 | ||
|
|
c7a609fa7a | ||
|
|
5957cb65f9 | ||
|
|
d0ed731b9e | ||
|
|
b60166a541 | ||
|
|
ccf2ea04cb | ||
|
|
cb9c2e7616 |
@@ -48,6 +48,9 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
|
||||
</div>
|
||||
|
||||
# Changelog
|
||||
- 2025/04/16 1.3.4 Released
|
||||
- Slightly improved the speed of OCR detection by removing some unused blocks.
|
||||
- Fixed page-level sorting errors caused by footnotes in certain cases.
|
||||
- 2025/04/12 1.3.2 released
|
||||
- Fixed the issue of incompatible dependency package versions when installing in Python 3.13 environment on Windows systems.
|
||||
- Optimized memory usage during batch inference.
|
||||
|
||||
@@ -47,6 +47,9 @@
|
||||
</div>
|
||||
|
||||
# 更新记录
|
||||
- 2025/04/16 1.3.4 发布
|
||||
- 通过移除一些无用的块,小幅提升了ocr-det的速度
|
||||
- 修复部分情况下由footnote导致的页面内排序错误
|
||||
- 2025/04/12 1.3.2 发布
|
||||
- 修复了windows系统下,在python3.13环境安装时一些依赖包版本不兼容的问题
|
||||
- 优化批量推理时的内存占用
|
||||
|
||||
@@ -35,6 +35,7 @@ RUN /bin/bash -c "wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/m
|
||||
cp magic-pdf.template.json /root/magic-pdf.json && \
|
||||
source /opt/mineru_venv/bin/activate && \
|
||||
pip3 install --upgrade pip -i https://mirrors.aliyun.com/pypi/simple && \
|
||||
pip3 install torch==2.3.1 torchvision==0.18.1 -i https://mirrors.aliyun.com/pypi/simple && \
|
||||
pip3 install -U magic-pdf[full] -i https://mirrors.aliyun.com/pypi/simple && \
|
||||
wget https://gitee.com/ascend/pytorch/releases/download/v6.0.rc2-pytorch2.3.1/torch_npu-2.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl && \
|
||||
pip3 install torch_npu-2.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl"
|
||||
|
||||
@@ -18,7 +18,17 @@ RUN apt-get update && \
|
||||
wget \
|
||||
git \
|
||||
libgl1 \
|
||||
libreoffice \
|
||||
fonts-noto-cjk \
|
||||
fonts-wqy-zenhei \
|
||||
fonts-wqy-microhei \
|
||||
ttf-mscorefonts-installer \
|
||||
fontconfig \
|
||||
libglib2.0-0 \
|
||||
libxrender1 \
|
||||
libsm6 \
|
||||
libxext6 \
|
||||
poppler-utils \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Set Python 3.10 as the default python3
|
||||
|
||||
@@ -18,7 +18,17 @@ RUN apt-get update && \
|
||||
wget \
|
||||
git \
|
||||
libgl1 \
|
||||
libreoffice \
|
||||
fonts-noto-cjk \
|
||||
fonts-wqy-zenhei \
|
||||
fonts-wqy-microhei \
|
||||
ttf-mscorefonts-installer \
|
||||
fontconfig \
|
||||
libglib2.0-0 \
|
||||
libxrender1 \
|
||||
libsm6 \
|
||||
libxext6 \
|
||||
poppler-utils \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Set Python 3.10 as the default python3
|
||||
|
||||
@@ -1 +1 @@
|
||||
__version__ = "1.3.1"
|
||||
__version__ = "1.3.3"
|
||||
|
||||
@@ -147,7 +147,7 @@ def doc_analyze(
|
||||
images.append(img_dict['img'])
|
||||
page_wh_list.append((img_dict['width'], img_dict['height']))
|
||||
|
||||
images_with_extra_info = [(images[index], ocr, dataset._lang) for index in range(len(dataset))]
|
||||
images_with_extra_info = [(images[index], ocr, dataset._lang) for index in range(len(images))]
|
||||
|
||||
if len(images) >= MIN_BATCH_INFERENCE_SIZE:
|
||||
batch_size = MIN_BATCH_INFERENCE_SIZE
|
||||
|
||||
@@ -2,6 +2,8 @@ import time
|
||||
import torch
|
||||
from loguru import logger
|
||||
import numpy as np
|
||||
|
||||
from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio
|
||||
from magic_pdf.libs.clean_memory import clean_memory
|
||||
|
||||
|
||||
@@ -188,9 +190,46 @@ def filter_nested_tables(table_res_list, overlap_threshold=0.8, area_threshold=0
|
||||
return [table for i, table in enumerate(table_res_list) if i not in big_tables_idx]
|
||||
|
||||
|
||||
def remove_overlaps_min_blocks(res_list):
|
||||
# 重叠block,小的不能直接删除,需要和大的那个合并成一个更大的。
|
||||
# 删除重叠blocks中较小的那些
|
||||
need_remove = []
|
||||
for res1 in res_list:
|
||||
for res2 in res_list:
|
||||
if res1 != res2:
|
||||
overlap_box = get_minbox_if_overlap_by_ratio(
|
||||
res1['bbox'], res2['bbox'], 0.8
|
||||
)
|
||||
if overlap_box is not None:
|
||||
res_to_remove = next(
|
||||
(res for res in res_list if res['bbox'] == overlap_box),
|
||||
None,
|
||||
)
|
||||
if (
|
||||
res_to_remove is not None
|
||||
and res_to_remove not in need_remove
|
||||
):
|
||||
large_res = res1 if res1 != res_to_remove else res2
|
||||
x1, y1, x2, y2 = large_res['bbox']
|
||||
sx1, sy1, sx2, sy2 = res_to_remove['bbox']
|
||||
x1 = min(x1, sx1)
|
||||
y1 = min(y1, sy1)
|
||||
x2 = max(x2, sx2)
|
||||
y2 = max(y2, sy2)
|
||||
large_res['bbox'] = [x1, y1, x2, y2]
|
||||
need_remove.append(res_to_remove)
|
||||
|
||||
if len(need_remove) > 0:
|
||||
for res in need_remove:
|
||||
res_list.remove(res)
|
||||
|
||||
return res_list, need_remove
|
||||
|
||||
|
||||
def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshold=0.8, area_threshold=0.8):
|
||||
"""Extract OCR, table and other regions from layout results."""
|
||||
ocr_res_list = []
|
||||
text_res_list = []
|
||||
table_res_list = []
|
||||
table_indices = []
|
||||
single_page_mfdetrec_res = []
|
||||
@@ -204,11 +243,14 @@ def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshol
|
||||
"bbox": [int(res['poly'][0]), int(res['poly'][1]),
|
||||
int(res['poly'][4]), int(res['poly'][5])],
|
||||
})
|
||||
elif category_id in [0, 1, 2, 4, 6, 7]: # OCR regions
|
||||
elif category_id in [0, 2, 4, 6, 7]: # OCR regions
|
||||
ocr_res_list.append(res)
|
||||
elif category_id == 5: # Table regions
|
||||
table_res_list.append(res)
|
||||
table_indices.append(i)
|
||||
elif category_id in [1]: # Text regions
|
||||
res['bbox'] = [int(res['poly'][0]), int(res['poly'][1]), int(res['poly'][4]), int(res['poly'][5])]
|
||||
text_res_list.append(res)
|
||||
|
||||
# Process tables: merge high IoU tables first, then filter nested tables
|
||||
table_res_list, table_indices = merge_high_iou_tables(
|
||||
@@ -226,6 +268,22 @@ def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshol
|
||||
for idx in sorted(to_remove, reverse=True):
|
||||
del layout_res[idx]
|
||||
|
||||
# Remove overlaps in OCR and text regions
|
||||
text_res_list, need_remove = remove_overlaps_min_blocks(text_res_list)
|
||||
for res in text_res_list:
|
||||
# 将res的poly使用bbox重构
|
||||
res['poly'] = [res['bbox'][0], res['bbox'][1], res['bbox'][2], res['bbox'][1],
|
||||
res['bbox'][2], res['bbox'][3], res['bbox'][0], res['bbox'][3]]
|
||||
# 删除res的bbox
|
||||
del res['bbox']
|
||||
|
||||
ocr_res_list.extend(text_res_list)
|
||||
|
||||
if len(need_remove) > 0:
|
||||
for res in need_remove:
|
||||
del res['bbox']
|
||||
layout_res.remove(res)
|
||||
|
||||
return ocr_res_list, filtered_table_res_list, single_page_mfdetrec_res
|
||||
|
||||
|
||||
|
||||
@@ -490,7 +490,7 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
|
||||
return [[x0, y0, x1, y1]]
|
||||
|
||||
|
||||
def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
|
||||
def sort_lines_by_model(fix_blocks, page_w, page_h, line_height, footnote_blocks):
|
||||
page_line_list = []
|
||||
|
||||
def add_lines_to_block(b):
|
||||
@@ -519,6 +519,10 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
|
||||
block['real_lines'] = copy.deepcopy(block['lines'])
|
||||
add_lines_to_block(block)
|
||||
|
||||
for block in footnote_blocks:
|
||||
footnote_block = {'bbox': block[:4]}
|
||||
add_lines_to_block(footnote_block)
|
||||
|
||||
if len(page_line_list) > 200: # layoutreader最高支持512line
|
||||
return None
|
||||
|
||||
@@ -779,7 +783,7 @@ def parse_page_core(
|
||||
# interline_equation_blocks参数不够准,后面切换到interline_equations上
|
||||
interline_equation_blocks = []
|
||||
if len(interline_equation_blocks) > 0:
|
||||
all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2(
|
||||
all_bboxes, all_discarded_blocks, footnote_blocks = ocr_prepare_bboxes_for_layout_split_v2(
|
||||
img_body_blocks, img_caption_blocks, img_footnote_blocks,
|
||||
table_body_blocks, table_caption_blocks, table_footnote_blocks,
|
||||
discarded_blocks,
|
||||
@@ -790,7 +794,7 @@ def parse_page_core(
|
||||
page_h,
|
||||
)
|
||||
else:
|
||||
all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2(
|
||||
all_bboxes, all_discarded_blocks, footnote_blocks = ocr_prepare_bboxes_for_layout_split_v2(
|
||||
img_body_blocks, img_caption_blocks, img_footnote_blocks,
|
||||
table_body_blocks, table_caption_blocks, table_footnote_blocks,
|
||||
discarded_blocks,
|
||||
@@ -866,7 +870,7 @@ def parse_page_core(
|
||||
line_height = get_line_height(fix_blocks)
|
||||
|
||||
"""获取所有line并对line排序"""
|
||||
sorted_bboxes = sort_lines_by_model(fix_blocks, page_w, page_h, line_height)
|
||||
sorted_bboxes = sort_lines_by_model(fix_blocks, page_w, page_h, line_height, footnote_blocks)
|
||||
|
||||
"""根据line的中位数算block的序列关系"""
|
||||
fix_blocks = cal_block_index(fix_blocks, sorted_bboxes)
|
||||
|
||||
@@ -99,11 +99,11 @@ def ocr_prepare_bboxes_for_layout_split_v2(
|
||||
all_discarded_blocks = []
|
||||
add_bboxes(discarded_blocks, BlockType.Discarded, all_discarded_blocks)
|
||||
|
||||
"""footnote识别:宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的"""
|
||||
"""footnote识别:宽度超过1/3页面宽度的,高度超过10的,处于页面下半30%区域的"""
|
||||
footnote_blocks = []
|
||||
for discarded in discarded_blocks:
|
||||
x0, y0, x1, y1 = discarded['bbox']
|
||||
if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
|
||||
if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h * 0.7):
|
||||
footnote_blocks.append([x0, y0, x1, y1])
|
||||
|
||||
"""移除在footnote下面的任何框"""
|
||||
@@ -119,7 +119,7 @@ def ocr_prepare_bboxes_for_layout_split_v2(
|
||||
"""将剩余的bbox做分离处理,防止后面分layout时出错"""
|
||||
# all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
|
||||
all_bboxes.sort(key=lambda x: x[0]+x[1])
|
||||
return all_bboxes, all_discarded_blocks
|
||||
return all_bboxes, all_discarded_blocks, footnote_blocks
|
||||
|
||||
|
||||
def find_blocks_under_footnote(all_bboxes, footnote_blocks):
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
import os
|
||||
import subprocess
|
||||
import platform
|
||||
from pathlib import Path
|
||||
import shutil
|
||||
|
||||
|
||||
class ConvertToPdfError(Exception):
|
||||
@@ -9,21 +11,114 @@ class ConvertToPdfError(Exception):
|
||||
super().__init__(self.msg)
|
||||
|
||||
|
||||
# Chinese font list
|
||||
REQUIRED_CHS_FONTS = ['SimSun', 'Microsoft YaHei', 'Noto Sans CJK SC']
|
||||
|
||||
|
||||
def check_fonts_installed():
|
||||
"""Check if required Chinese fonts are installed."""
|
||||
system_type = platform.system()
|
||||
|
||||
if system_type == 'Windows':
|
||||
# Windows: check fonts via registry or system font folder
|
||||
font_dir = Path("C:/Windows/Fonts")
|
||||
installed_fonts = [f.name for f in font_dir.glob("*.ttf")]
|
||||
if any(font for font in REQUIRED_CHS_FONTS if any(font in f for f in installed_fonts)):
|
||||
return True
|
||||
raise EnvironmentError(
|
||||
f"Missing Chinese font. Please install at least one of: {', '.join(REQUIRED_CHS_FONTS)}"
|
||||
)
|
||||
else:
|
||||
# Linux/macOS: use fc-list
|
||||
try:
|
||||
output = subprocess.check_output(['fc-list', ':lang=zh'], encoding='utf-8')
|
||||
for font in REQUIRED_CHS_FONTS:
|
||||
if font in output:
|
||||
return True
|
||||
raise EnvironmentError(
|
||||
f"Missing Chinese font. Please install at least one of: {', '.join(REQUIRED_CHS_FONTS)}"
|
||||
)
|
||||
except Exception as e:
|
||||
raise EnvironmentError(f"Font detection failed. Please install 'fontconfig' and fonts: {str(e)}")
|
||||
|
||||
|
||||
def get_soffice_command():
|
||||
"""Return the path to LibreOffice's soffice executable depending on the platform."""
|
||||
system_type = platform.system()
|
||||
|
||||
# First check if soffice is in PATH
|
||||
soffice_path = shutil.which('soffice')
|
||||
if soffice_path:
|
||||
return soffice_path
|
||||
|
||||
if system_type == 'Windows':
|
||||
# Check common installation paths
|
||||
possible_paths = [
|
||||
Path(os.environ.get('PROGRAMFILES', 'C:/Program Files')) / 'LibreOffice/program/soffice.exe',
|
||||
Path(os.environ.get('PROGRAMFILES(X86)', 'C:/Program Files (x86)')) / 'LibreOffice/program/soffice.exe',
|
||||
Path('C:/Program Files/LibreOffice/program/soffice.exe'),
|
||||
Path('C:/Program Files (x86)/LibreOffice/program/soffice.exe')
|
||||
]
|
||||
|
||||
# Check other drives for windows
|
||||
for drive in ['C:', 'D:', 'E:', 'F:', 'G:', 'H:']:
|
||||
possible_paths.append(Path(f"{drive}/LibreOffice/program/soffice.exe"))
|
||||
|
||||
for path in possible_paths:
|
||||
if path.exists():
|
||||
return str(path)
|
||||
|
||||
raise ConvertToPdfError(
|
||||
"LibreOffice not found. Please install LibreOffice from https://www.libreoffice.org/ "
|
||||
"or ensure soffice.exe is in your PATH environment variable."
|
||||
)
|
||||
else:
|
||||
# For Linux/macOS, provide installation instructions if not found
|
||||
try:
|
||||
# Try to find soffice in standard locations
|
||||
possible_paths = [
|
||||
'/usr/bin/soffice',
|
||||
'/usr/local/bin/soffice',
|
||||
'/opt/libreoffice/program/soffice',
|
||||
'/Applications/LibreOffice.app/Contents/MacOS/soffice'
|
||||
]
|
||||
for path in possible_paths:
|
||||
if os.path.exists(path):
|
||||
return path
|
||||
|
||||
raise ConvertToPdfError(
|
||||
"LibreOffice not found. Please install it:\n"
|
||||
" - Ubuntu/Debian: sudo apt-get install libreoffice\n"
|
||||
" - CentOS/RHEL: sudo yum install libreoffice\n"
|
||||
" - macOS: brew install libreoffice or download from https://www.libreoffice.org/\n"
|
||||
" - Or ensure soffice is in your PATH environment variable."
|
||||
)
|
||||
except Exception as e:
|
||||
raise ConvertToPdfError(f"Error locating LibreOffice: {str(e)}")
|
||||
|
||||
|
||||
def convert_file_to_pdf(input_path, output_dir):
|
||||
"""Convert a single document (ppt, doc, etc.) to PDF."""
|
||||
if not os.path.isfile(input_path):
|
||||
raise FileNotFoundError(f"The input file {input_path} does not exist.")
|
||||
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
|
||||
check_fonts_installed()
|
||||
|
||||
soffice_cmd = get_soffice_command()
|
||||
|
||||
cmd = [
|
||||
'soffice',
|
||||
soffice_cmd,
|
||||
'--headless',
|
||||
'--norestore',
|
||||
'--invisible',
|
||||
'--convert-to', 'pdf',
|
||||
'--outdir', str(output_dir),
|
||||
str(input_path)
|
||||
]
|
||||
|
||||
|
||||
process = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
|
||||
|
||||
if process.returncode != 0:
|
||||
raise ConvertToPdfError(process.stderr.decode())
|
||||
raise ConvertToPdfError(f"LibreOffice convert failed: {process.stderr.decode()}")
|
||||
|
||||
4
setup.py
4
setup.py
@@ -43,7 +43,7 @@ if __name__ == '__main__':
|
||||
"matplotlib>=3.10,<4",
|
||||
"ultralytics>=8.3.48,<9", # yolov8,公式检测
|
||||
"doclayout_yolo==0.0.2b1", # doclayout_yolo
|
||||
"dill>=0.3.9,<1", # doclayout_yolo
|
||||
"dill>=0.3.8,<1", # doclayout_yolo
|
||||
"rapid_table>=1.0.5,<2.0.0", # rapid_table
|
||||
"PyYAML>=6.0.2,<7", # yaml
|
||||
"ftfy>=6.3.1,<7", # unimernet_hf
|
||||
@@ -56,7 +56,7 @@ if __name__ == '__main__':
|
||||
"matplotlib>=3.10,<=3.10.1",
|
||||
"ultralytics>=8.3.48,<=8.3.104", # yolov8,公式检测
|
||||
"doclayout_yolo==0.0.2b1", # doclayout_yolo
|
||||
"dill==0.3.9", # doclayout_yolo
|
||||
"dill==0.3.8", # doclayout_yolo
|
||||
"PyYAML==6.0.2", # yaml
|
||||
"ftfy==6.3.1", # unimernet_hf
|
||||
"openai==1.71.0", # openai SDK
|
||||
|
||||
@@ -223,6 +223,22 @@
|
||||
"created_at": "2025-03-24T12:58:56Z",
|
||||
"repoId": 765083837,
|
||||
"pullRequestNo": 1982
|
||||
},
|
||||
{
|
||||
"name": "zjx20",
|
||||
"id": 2639200,
|
||||
"comment_id": 2800714918,
|
||||
"created_at": "2025-04-14T07:25:26Z",
|
||||
"repoId": 765083837,
|
||||
"pullRequestNo": 2215
|
||||
},
|
||||
{
|
||||
"name": "Doge2077",
|
||||
"id": 91442300,
|
||||
"comment_id": 2801283257,
|
||||
"created_at": "2025-04-14T10:40:54Z",
|
||||
"repoId": 765083837,
|
||||
"pullRequestNo": 2226
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -2,31 +2,34 @@ import unittest
|
||||
from PIL import Image
|
||||
from lxml import etree
|
||||
|
||||
from magic_pdf.model.sub_modules.table.tablemaster.tablemaster_paddle import TableMasterPaddleModel
|
||||
from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
|
||||
from magic_pdf.model.sub_modules.table.rapidtable.rapid_table import RapidTableModel
|
||||
|
||||
|
||||
class TestppTableModel(unittest.TestCase):
|
||||
def test_image2html(self):
|
||||
img = Image.open("tests/unittest/test_table/assets/table.jpg")
|
||||
# 修改table模型路径
|
||||
config = {"device": "cuda",
|
||||
"model_dir": "/home/quyuan/.cache/modelscope/hub/opendatalab/PDF-Extract-Kit/models/TabRec/TableMaster"}
|
||||
table_model = TableMasterPaddleModel(config)
|
||||
res = table_model.img2html(img)
|
||||
img = Image.open("assets/table.jpg")
|
||||
atom_model_manager = AtomModelSingleton()
|
||||
ocr_engine = atom_model_manager.get_atom_model(
|
||||
atom_model_name='ocr',
|
||||
ocr_show_log=False,
|
||||
det_db_box_thresh=0.5,
|
||||
det_db_unclip_ratio=1.6,
|
||||
lang='ch'
|
||||
)
|
||||
table_model = RapidTableModel(ocr_engine, 'slanet_plus')
|
||||
html_code, table_cell_bboxes, logic_points, elapse = table_model.predict(img)
|
||||
# 验证生成的 HTML 是否符合预期
|
||||
parser = etree.HTMLParser()
|
||||
tree = etree.fromstring(res, parser)
|
||||
tree = etree.fromstring(html_code, parser)
|
||||
|
||||
# 检查 HTML 结构
|
||||
assert tree.find('.//table') is not None, "HTML should contain a <table> element"
|
||||
assert tree.find('.//thead') is not None, "HTML should contain a <thead> element"
|
||||
assert tree.find('.//tbody') is not None, "HTML should contain a <tbody> element"
|
||||
assert tree.find('.//tr') is not None, "HTML should contain a <tr> element"
|
||||
assert tree.find('.//td') is not None, "HTML should contain a <td> element"
|
||||
|
||||
# 检查具体的表格内容
|
||||
headers = tree.xpath('//thead/tr/td/b')
|
||||
print(headers) # Print headers for debugging
|
||||
headers = tree.xpath('//table/tr[1]/td')
|
||||
assert len(headers) == 5, "Thead should have 5 columns"
|
||||
assert headers[0].text and headers[0].text.strip() == "Methods", "First header should be 'Methods'"
|
||||
assert headers[1].text and headers[1].text.strip() == "R", "Second header should be 'R'"
|
||||
@@ -35,7 +38,7 @@ class TestppTableModel(unittest.TestCase):
|
||||
assert headers[4].text and headers[4].text.strip() == "FPS", "Fifth header should be 'FPS'"
|
||||
|
||||
# 检查第一行数据
|
||||
first_row = tree.xpath('//tbody/tr[1]/td')
|
||||
first_row = tree.xpath('//table/tr[2]/td')
|
||||
assert len(first_row) == 5, "First row should have 5 cells"
|
||||
assert first_row[0].text and first_row[0].text.strip() == "SegLink[26]", "First cell should be 'SegLink[26]'"
|
||||
assert first_row[1].text and first_row[1].text.strip() == "70.0", "Second cell should be '70.0'"
|
||||
@@ -44,14 +47,13 @@ class TestppTableModel(unittest.TestCase):
|
||||
assert first_row[4].text and first_row[4].text.strip() == "8.9", "Fifth cell should be '8.9'"
|
||||
|
||||
# 检查倒数第二行数据
|
||||
second_last_row = tree.xpath('//tbody/tr[position()=last()-1]/td')
|
||||
second_last_row = tree.xpath('//table/tr[position()=last()-1]/td')
|
||||
assert len(second_last_row) == 5, "second_last_row should have 5 cells"
|
||||
assert second_last_row[0].text and second_last_row[
|
||||
0].text.strip() == "Ours (SynText)", "First cell should be 'Ours (SynText)'"
|
||||
assert second_last_row[0].text and second_last_row[0].text.strip() == "Ours (SynText)", "First cell should be 'Ours (SynText)'"
|
||||
assert second_last_row[1].text and second_last_row[1].text.strip() == "80.68", "Second cell should be '80.68'"
|
||||
assert second_last_row[2].text and second_last_row[2].text.strip() == "85.40", "Third cell should be '85.40'"
|
||||
assert second_last_row[3].text and second_last_row[3].text.strip() == "82.97", "Fourth cell should be '82.97'"
|
||||
assert second_last_row[3].text and second_last_row[4].text.strip() == "12.68", "Fifth cell should be '12.68'"
|
||||
# assert second_last_row[3].text and second_last_row[3].text.strip() == "82.97", "Fourth cell should be '82.97'"
|
||||
# assert second_last_row[3].text and second_last_row[4].text.strip() == "12.68", "Fifth cell should be '12.68'"
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
Reference in New Issue
Block a user