Compare commits

...

80 Commits

Author SHA1 Message Date
Xiaomeng Zhao
3124678a20 Merge pull request #4468 from myhloli/add_docx
Add docx
2026-02-02 11:23:43 +08:00
Xiaomeng Zhao
ac2099a332 Merge branch 'opendatalab:add_docx' into add_docx 2026-02-02 11:23:10 +08:00
Xiaomeng Zhao
7448029a9d Merge pull request #4448 from Sidney233/add_pptx
Add pptx
2026-01-30 21:47:13 +08:00
Sidney233
e92dfe5c6b feat: 添加解析pptx 2026-01-30 17:07:02 +08:00
Sidney233
8d14f64369 feat: 添加解析pptx 2026-01-29 17:24:28 +08:00
Xiaomeng Zhao
6aae690998 Merge branch 'opendatalab:add_docx' into add_docx 2026-01-28 18:25:22 +08:00
Xiaomeng Zhao
9393a62d94 Merge pull request #4432 from Sidney233/add_docx
fix: sdt标签中包含drawling元素,导致跳过整个sdt标签的解析
2026-01-28 18:20:32 +08:00
Sidney233
25e48960c7 Merge branch 'opendatalab:add_docx' into add_docx 2026-01-28 17:46:25 +08:00
Sidney233
52874072e4 fix: sdt标签中包含drawling元素,导致跳过整个sdt标签的解析 2026-01-28 17:45:04 +08:00
Sidney233
00b77122df work review 2026-01-28 17:30:55 +08:00
Xiaomeng Zhao
6fc0596ce9 Merge branch 'opendatalab:add_docx' into add_docx 2026-01-28 17:17:00 +08:00
Xiaomeng Zhao
32c08d1418 Merge pull request #4430 from Sidney233/add_docx
feat: 添加目录解析
2026-01-28 17:14:34 +08:00
Sidney233
e20523018a feat: 添加目录解析 2026-01-28 17:07:57 +08:00
Sidney233
c05e77fcdd work review 2026-01-28 17:01:23 +08:00
Xiaomeng Zhao
72f26156f0 Merge branch 'opendatalab:add_docx' into add_docx 2026-01-26 21:26:55 +08:00
Xiaomeng Zhao
29751a415f Merge pull request #4417 from Sidney233/add_docx
fix: 因为可能出现重复id导致丢失部分文本框
2026-01-26 16:47:59 +08:00
Sidney233
3743463438 fix: 因为可能出现重复id导致丢失部分文本框 2026-01-26 16:44:50 +08:00
myhloli
6725648080 feat: enhance DOCX processing to classify caption blocks and clean table HTML 2026-01-26 15:21:48 +08:00
Xiaomeng Zhao
efb0421fe8 Merge pull request #4403 from Sidney233/add_docx
feat: 添加识别新旧文本框的方法
2026-01-23 15:32:54 +08:00
Xiaomeng Zhao
777307bad9 Merge branch 'add_docx' into add_docx 2026-01-23 15:32:42 +08:00
Xiaomeng Zhao
5ee10f0306 Merge pull request #4404 from myhloli/add_docx
Add docx
2026-01-23 15:28:35 +08:00
Sidney233
8611da6a4b Merge remote-tracking branch 'origin/add_docx' into add_docx 2026-01-23 14:55:24 +08:00
Sidney233
775aafb033 feat: 添加识别新旧文本框的方法 2026-01-23 14:54:00 +08:00
myhloli
78e83d00a7 feat: refine DOCX processing to adjust title level handling and streamline list item references 2026-01-21 18:48:40 +08:00
myhloli
7bcd3afb86 feat: refactor DOCX processing to unify block type handling for captions and list items 2026-01-21 17:21:34 +08:00
myhloli
517257e058 feat: enhance DOCX text processing to include equations and hyperlinks in content generation 2026-01-21 17:04:22 +08:00
myhloli
ee4065ffd5 feat: refactor line object construction in DOCX processing to streamline block handling 2026-01-21 11:08:19 +08:00
myhloli
5026faa458 feat: update DOCX processing to change test document path and enhance chart handling with improved content structure 2026-01-19 20:18:24 +08:00
myhloli
8381e61f0c feat: update DOCX processing to change test document path and enhance chart handling with improved content structure 2026-01-19 18:37:56 +08:00
Xiaomeng Zhao
a8c4b6c2fe Merge pull request #4388 from myhloli/add_docx
Add docx
2026-01-19 17:02:34 +08:00
Xiaomeng Zhao
a0b0eb704c Merge branch 'opendatalab:add_docx' into add_docx 2026-01-19 17:01:40 +08:00
Xiaomeng Zhao
1ed570a205 Merge pull request #4386 from Sidney233/add_docx
feat: 新增处理内嵌excel表格
2026-01-19 17:00:58 +08:00
Sidney233
21ebf6bdb1 Merge branch 'opendatalab:add_docx' into add_docx 2026-01-19 16:28:13 +08:00
Sidney233
11513dd44c feat: 添加图表处理excel表格 2026-01-19 16:26:27 +08:00
myhloli
32592cd27f feat: enhance DOCX header and footer processing by adding deduplication for inline equations and hyperlinks 2026-01-16 17:16:51 +08:00
myhloli
9137f84591 feat: enhance DOCX header and footer processing by adding deduplication for inline equations and hyperlinks 2026-01-16 16:14:04 +08:00
myhloli
56c3bb3570 feat: enhance DOCX processing by adding hyperlink support and improving text formatting with equations 2026-01-16 16:04:45 +08:00
myhloli
23e3a73f33 feat: enhance DOCX processing by adding hyperlink support and improving text formatting with equations 2026-01-16 15:22:30 +08:00
myhloli
e7c67a95b6 feat: enhance DOCX processing by adding main execution block and improving text content handling with equations 2026-01-15 20:01:22 +08:00
myhloli
ea6bb2ede9 feat: update result_to_middle_json function to streamline parameters and enhance JSON output structure 2026-01-14 19:50:38 +08:00
myhloli
810717b42a feat: refactor DOCX processing by consolidating image handling and introducing MagicModel for block management 2026-01-14 16:56:49 +08:00
Xiaomeng Zhao
7554127ff7 Merge pull request #4367 from Sidney233/add_docx
修复list_item重复统计,过滤掉内容为空的block,页眉页脚的纯数字内容不添加进page,caption识别
2026-01-14 16:47:31 +08:00
Sidney233
d629ede38a 修复list_item重复统计,过滤掉内容为空的block,页眉页脚的纯数字内容不添加进page,caption识别 2026-01-14 16:04:47 +08:00
Xiaomeng Zhao
6501ad878d Merge pull request #4353 from myhloli/add_docx
Add docx
2026-01-13 15:15:15 +08:00
myhloli
6c8fa9776f feat: simplify element handling in DOCX processing by removing unnecessary references and improving structure 2026-01-13 15:13:38 +08:00
Xiaomeng Zhao
1d93aa8ab9 Merge branch 'opendatalab:add_docx' into add_docx 2026-01-12 19:08:34 +08:00
Xiaomeng Zhao
9aba297545 Merge pull request #4349 from Sidney233/add_docx
feat: section division and header and footer processing
2026-01-12 19:08:01 +08:00
myhloli
dec84a9b5a feat: disable model output dumping in DOCX processing for improved performance 2026-01-12 18:33:13 +08:00
Sidney233
cbe39f4a5a Merge branch 'opendatalab:add_docx' into add_docx 2026-01-12 17:10:51 +08:00
Sidney233
e042384953 分节与页眉页脚处理完成 2026-01-12 17:08:37 +08:00
Sidney233
a644a8a074 分节处理问题的TODO 2026-01-09 17:39:16 +08:00
myhloli
07db6839b8 feat: refactor logging to use loguru and add BlockType and ContentBlock classes for structured content handling 2026-01-07 19:24:18 +08:00
myhloli
17394682e2 feat: add lxml dependency for enhanced XML processing in DOCX handling 2026-01-07 14:22:57 +08:00
Xiaomeng Zhao
97bd2a2b94 Merge pull request #4310 from myhloli/add_docx
Add docx
2026-01-07 10:38:52 +08:00
myhloli
ad175df3d2 feat: enhance DOCX processing by refining image handling and improving logging for inference timing 2026-01-06 20:04:06 +08:00
myhloli
0cbe965d97 feat: enhance DOCX processing by adding support for office file types and refactoring related functions 2026-01-06 19:49:40 +08:00
Xiaomeng Zhao
74f6d4d0e7 Merge pull request #4309 from myhloli/add_docx
feat: enhance OMML processing with additional LaTeX functions and imp…
2026-01-06 17:19:32 +08:00
myhloli
648fb1f7cf feat: update array formatting in latex_dict.py for improved LaTeX output 2026-01-06 17:19:01 +08:00
myhloli
b6fc07cf9e feat: replace logging with loguru for improved logging functionality in OMML processing 2026-01-06 17:11:06 +08:00
myhloli
57be6926a9 feat: enhance OMML processing with additional LaTeX functions and improve unicode handling 2026-01-06 16:22:05 +08:00
Xiaomeng Zhao
7abcfa39a0 Merge pull request #4308 from myhloli/add_docx
feat: refactor DOCX utilities and update dependencies for improved pr…
2026-01-06 16:17:19 +08:00
myhloli
6f76664141 feat: refactor DOCX utilities and update dependencies for improved processing 2026-01-06 16:15:44 +08:00
Xiaomeng Zhao
23bc263b85 Merge pull request #4299 from myhloli/add_docx
Add docx
2026-01-06 11:27:20 +08:00
myhloli
53fb1cd055 feat: implement DOCX processing in the converter 2026-01-06 11:26:18 +08:00
myhloli
f0ce905c7d fix: update pdfminer.six version to 20251230 in pyproject.toml 2026-01-05 19:08:25 +08:00
myhloli
df33d483de Merge remote-tracking branch 'origin/add_docx' into add_docx 2026-01-05 17:44:38 +08:00
myhloli
f44fb174ea feat: add support for DOCX file format in converter 2026-01-05 17:44:17 +08:00
Xiaomeng Zhao
70b1e73606 Merge pull request #4293 from Sidney233/docx-dev
fix
2026-01-05 17:43:10 +08:00
Sidney233
11fb0a0199 fix: add util files 2026-01-05 17:40:37 +08:00
Xiaomeng Zhao
66f8f0e93a Merge pull request #4292 from Sidney233/docx-dev
Docx dev
2026-01-05 14:10:21 +08:00
Xiaomeng Zhao
942c1693c7 Merge branch 'add_docx' into docx-dev 2026-01-05 14:10:04 +08:00
Sidney233
7387797b17 work review 2026-01-05 13:33:55 +08:00
Sidney233
8a3eb268a9 work review 2025-12-31 20:05:20 +08:00
Sidney233
fcaa34f466 work review 2025-12-26 20:18:17 +08:00
Sidney233
f111a97d5d work review 2025-12-23 15:32:26 +08:00
Sidney233
6d8106685e work review 2025-12-12 19:22:04 +08:00
Sidney233
4e29979d89 feat: add word text and pic 2025-12-08 15:17:00 +08:00
Sidney233
569baff305 feat: add word text and pic 2025-12-08 15:14:49 +08:00
Sidney233
09920c6391 work review 2025-12-05 22:39:49 +08:00
Sidney233
bc02653d63 work review 2025-11-21 16:44:28 +08:00
81 changed files with 8230 additions and 37 deletions

View File

@@ -6,7 +6,8 @@ from pathlib import Path
from loguru import logger
from mineru.cli.common import convert_pdf_bytes_to_bytes_by_pypdfium2, prepare_env, read_fn
from mineru.cli.common import convert_pdf_bytes_to_bytes_by_pypdfium2, prepare_env, read_fn, pptx_suffixes, \
xlsx_suffixes, pdf_suffixes, image_suffixes, office_suffixes, docx_suffixes
from mineru.data.data_reader_writer import FileBasedDataWriter
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
from mineru.utils.engine_utils import get_vlm_engine
@@ -17,7 +18,9 @@ from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as
from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
from mineru.backend.hybrid.hybrid_analyze import doc_analyze as hybrid_doc_analyze
from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path
from mineru.backend.office.office_middle_json_mkcontent import union_make as office_union_make
from mineru.backend.office.docx_analyze import office_docx_analyze
from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path, guess_suffix_by_bytes
def do_parse(
@@ -41,6 +44,24 @@ def do_parse(
start_page_id=0, # Start page ID for parsing, default is 0
end_page_id=None, # End page ID for parsing, default is None (parse all pages until the end of the document)
):
need_remove_index = _process_office_doc(
output_dir,
pdf_file_names=pdf_file_names,
pdf_bytes_list=pdf_bytes_list,
f_dump_md=f_dump_md,
f_dump_middle_json=f_dump_middle_json,
f_dump_model_output=f_dump_model_output,
f_dump_orig_file=f_dump_orig_pdf,
f_dump_content_list=f_dump_content_list,
f_make_md_mode=f_make_md_mode,
)
for index in sorted(need_remove_index, reverse=True):
del pdf_bytes_list[index]
del pdf_file_names[index]
del p_lang_list[index]
if not pdf_bytes_list:
logger.warning("No valid PDF or image files to process.")
return
if backend == "pipeline":
for idx, pdf_bytes in enumerate(pdf_bytes_list):
@@ -68,7 +89,7 @@ def do_parse(
pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
f_make_md_mode, middle_json, model_json, is_pipeline=True
f_make_md_mode, middle_json, model_json, process_mode="pipeline"
)
else:
f_draw_span_bbox = False
@@ -93,7 +114,7 @@ def do_parse(
pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
f_make_md_mode, middle_json, infer_result, is_pipeline=False
f_make_md_mode, middle_json, infer_result, process_mode="vlm"
)
elif backend.startswith("hybrid-"):
backend = backend[7:]
@@ -123,9 +144,56 @@ def do_parse(
pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
f_make_md_mode, middle_json, infer_result, is_pipeline=False
f_make_md_mode, middle_json, infer_result, process_mode="vlm"
)
def _process_office_doc(
        output_dir,
        pdf_file_names: list[str],
        pdf_bytes_list: list[bytes],
        f_dump_md=True,
        f_dump_middle_json=True,
        f_dump_model_output=True,
        f_dump_orig_file=True,
        f_dump_content_list=True,
        f_make_md_mode=MakeMode.MM_MD,
):
    """Handle the office documents found in the input batch.

    Each entry's type is sniffed from its bytes. DOCX entries are analyzed and
    their outputs written right away; PPTX and XLSX entries only produce a
    warning. Entries of any other type are left alone.

    Returns:
        The positions (into ``pdf_bytes_list``) of every DOCX/PPTX/XLSX entry,
        in ascending order, so the caller can drop them before running the
        PDF/image backends.
    """
    handled_positions = []

    for position, payload in enumerate(pdf_bytes_list):
        doc_name = pdf_file_names[position]
        detected_suffix = guess_suffix_by_bytes(payload)

        if detected_suffix in docx_suffixes:
            handled_positions.append(position)

            image_dir, md_dir = prepare_env(output_dir, doc_name, "office")
            image_writer = FileBasedDataWriter(image_dir)
            md_writer = FileBasedDataWriter(md_dir)

            middle_json, infer_result = office_docx_analyze(
                payload,
                image_writer=image_writer,
            )

            # Both bbox-drawing flags are passed as False for office documents.
            _process_output(
                middle_json["pdf_info"], payload, doc_name, md_dir, image_dir,
                md_writer, False, False, f_dump_orig_file,
                f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
                f_make_md_mode, middle_json, infer_result, process_mode="docx"
            )
        elif detected_suffix in pptx_suffixes:
            handled_positions.append(position)
            logger.warning(f"Currently, PPTX files are not supported: {doc_name}")
        elif detected_suffix in xlsx_suffixes:
            handled_positions.append(position)
            logger.warning(f"Currently, XLSX files are not supported: {doc_name}")

    return handled_positions
def _process_output(
pdf_info,
pdf_bytes,
@@ -143,8 +211,18 @@ def _process_output(
f_make_md_mode,
middle_json,
model_output=None,
is_pipeline=True
process_mode="vlm"
):
if process_mode == "pipeline":
make_func = pipeline_union_make
elif process_mode == "vlm":
make_func = vlm_union_make
elif process_mode in office_suffixes:
make_func = office_union_make
else:
raise Exception(f"Unknown process_mode: {process_mode}")
"""处理输出文件"""
if f_draw_layout_bbox:
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")
@@ -153,15 +231,20 @@ def _process_output(
draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")
if f_dump_orig_pdf:
md_writer.write(
f"{pdf_file_name}_origin.pdf",
pdf_bytes,
)
if process_mode in ["pipeline", "vlm"]:
md_writer.write(
f"{pdf_file_name}_origin.pdf",
pdf_bytes,
)
elif process_mode in office_suffixes:
md_writer.write(
f"{pdf_file_name}_origin.{process_mode}",
pdf_bytes,
)
image_dir = str(os.path.basename(local_image_dir))
if f_dump_md:
make_func = pipeline_union_make if is_pipeline else vlm_union_make
md_content_str = make_func(pdf_info, f_make_md_mode, image_dir)
md_writer.write_string(
f"{pdf_file_name}.md",
@@ -169,13 +252,19 @@ def _process_output(
)
if f_dump_content_list:
make_func = pipeline_union_make if is_pipeline else vlm_union_make
content_list = make_func(pdf_info, MakeMode.CONTENT_LIST, image_dir)
md_writer.write_string(
f"{pdf_file_name}_content_list.json",
json.dumps(content_list, ensure_ascii=False, indent=4),
)
if process_mode != "pipeline":
content_list_v2 = make_func(pdf_info, MakeMode.CONTENT_LIST_V2, image_dir)
md_writer.write_string(
f"{pdf_file_name}_content_list_v2.json",
json.dumps(content_list_v2, ensure_ascii=False, indent=4),
)
if f_dump_middle_json:
md_writer.write_string(
f"{pdf_file_name}_middle.json",
@@ -254,14 +343,12 @@ def parse_doc(
if __name__ == '__main__':
# args
__dir__ = os.path.dirname(os.path.abspath(__file__))
pdf_files_dir = os.path.join(__dir__, "pdfs")
pdf_files_dir = os.path.join(__dir__, "docx")
output_dir = os.path.join(__dir__, "output")
pdf_suffixes = ["pdf"]
image_suffixes = ["png", "jpeg", "jp2", "webp", "gif", "bmp", "jpg"]
doc_path_list = []
for doc_path in Path(pdf_files_dir).glob('*'):
if guess_suffix_by_path(doc_path) in pdf_suffixes + image_suffixes:
if guess_suffix_by_path(doc_path) in pdf_suffixes + image_suffixes + office_suffixes:
doc_path_list.append(doc_path)
"""如果您由于网络问题无法下载模型可以设置环境变量MINERU_MODEL_SOURCE为modelscope使用免代理仓库下载模型"""

View File

@@ -0,0 +1 @@
# Copyright (c) Opendatalab. All rights reserved.

View File

@@ -0,0 +1,43 @@
# Copyright (c) Opendatalab. All rights reserved.
import time
from io import BytesIO
from loguru import logger
from mineru.backend.office.model_output_to_middle_json import result_to_middle_json
from mineru.model.docx.main import convert_binary
def office_docx_analyze(
        file_bytes,
        image_writer=None
):
    """Convert a DOCX file held in memory into middle-JSON.

    Args:
        file_bytes: Raw bytes of the document; wrapped in a ``BytesIO`` and
            handed to ``convert_binary``.
        image_writer: Passed through unchanged to ``result_to_middle_json``.

    Returns:
        A ``(middle_json, results)`` tuple, where ``results`` is whatever
        ``convert_binary`` returned.
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # (or turn negative) if the system wall clock is adjusted mid-run.
    infer_start = time.perf_counter()
    results = convert_binary(BytesIO(file_bytes))
    infer_time = round(time.perf_counter() - infer_start, 2)

    # Rounding can collapse a very fast run to 0.0; clamp before dividing.
    safe_time = max(infer_time, 0.01)
    logger.debug(f"infer finished, cost: {infer_time}, speed: {round(len(results) / safe_time, 3)} page/s")

    middle_json = result_to_middle_json(
        results,
        image_writer,
    )
    return middle_json, results
if __name__ == '__main__':
    # Ad-hoc manual check: parse one local DOCX file and print its middle-JSON.
    import json
    from mineru.data.data_reader_writer import FileBasedDataWriter

    # NOTE(review): absolute path on a developer machine; adjust before running.
    docx_path = "/Users/myhloli/projects/20240809magic_pdf/Magic-PDF/mineru/model/docx/test.docx"

    with open(docx_path, 'rb') as f:
        file_bytes = f.read()

    image_writer = FileBasedDataWriter("./output_images")
    middle_json, results = office_docx_analyze(file_bytes, image_writer=image_writer)

    print(json.dumps(middle_json, indent=2, ensure_ascii=False))

View File

@@ -0,0 +1,48 @@
from loguru import logger
from mineru.backend.office.office_magic_model import MagicModel
from mineru.version import __version__
def blocks_to_page_info(page_blocks, image_writer, page_index) -> dict:
    """Convert the raw blocks of one page into a page-info dict.

    The blocks are categorised by ``MagicModel``; every category except the
    discarded one is merged into a single list ordered by each block's
    ``"index"``. ``image_writer`` is accepted but not used here.

    Returns:
        ``{"para_blocks": [...], "discarded_blocks": [...], "page_idx": page_index}``
    """
    model = MagicModel(page_blocks)

    # Concatenation order matters only for blocks sharing an index: the sort
    # below is stable, so ties keep this category order.
    merged = [
        *model.get_image_blocks(),
        *model.get_table_blocks(),
        *model.get_code_blocks(),
        *model.get_ref_text_blocks(),
        *model.get_phonetic_blocks(),
        *model.get_title_blocks(),
        *model.get_text_blocks(),
        *model.get_interline_equation_blocks(),
        *model.get_list_blocks(),
    ]
    ordered = sorted(merged, key=lambda blk: blk["index"])

    return {
        "para_blocks": ordered,
        "discarded_blocks": model.get_discarded_blocks(),
        "page_idx": page_index,
    }
def result_to_middle_json(model_output_blocks_list, image_writer):
    """Build the middle-JSON document from a list of per-page block lists.

    Each element of ``model_output_blocks_list`` is converted with
    ``blocks_to_page_info``, using its position in the list as the page index.
    """
    pages = [
        blocks_to_page_info(blocks_of_page, image_writer, page_no)
        for page_no, blocks_of_page in enumerate(model_output_blocks_list)
    ]
    return {"pdf_info": pages, "_backend": "office", "_version_name": __version__}

View File

@@ -0,0 +1,639 @@
import re
from typing import Literal
from loguru import logger
from mineru.utils.boxbase import calculate_overlap_area_in_bbox1_area_ratio
from mineru.utils.enum_class import ContentType, BlockType
from mineru.utils.magic_model_utils import reduct_overlap, tie_up_category_by_index
# Categorises the raw blocks of one page into per-type block lists (image, table,
# code, text, title, list, ...) that are exposed through the get_*_blocks() accessors.
# NOTE(review): indentation was lost in this view of the file, and `block_content`,
# `block_bbox` and `line` are read inside __init__ without any assignment visible
# before their first use -- some lines appear to be missing here; confirm against
# the full file before changing this code.
class MagicModel:
def __init__(self, page_blocks: list):
self.page_blocks = page_blocks
blocks = []
self.all_spans = []
# Classify generic caption blocks as image_caption or table_caption
page_blocks = classify_caption_blocks(page_blocks)
# Parse each block
for index, block_info in enumerate(page_blocks):
block_type = block_info["type"]
span_type = "unknown"
if block_type in [
"text",
"title",
"image_caption",
"table_caption",
"header",
"footer",
]:
span_type = ContentType.TEXT
elif block_type in ["image"]:
block_type = BlockType.IMAGE_BODY
span_type = ContentType.IMAGE
elif block_type in ["table"]:
block_type = BlockType.TABLE_BODY
span_type = ContentType.TABLE
elif block_type in ["equation"]:
block_type = BlockType.INTERLINE_EQUATION
span_type = ContentType.INTERLINE_EQUATION
# NOTE(review): compared against string literals here but against ContentType
# members just below -- presumably ContentType.IMAGE == "image" and
# ContentType.TABLE == "table"; verify in enum_class.
if span_type in ["image", "table"]:
span = {
"type": span_type,
}
if span_type == ContentType.TABLE:
span["html"] = clean_table_html(block_info.get("content", ""))
elif span_type == ContentType.IMAGE:
# base64-encoded image data (JPG format)
span["image_base64"] = block_info.get("content", "")
elif span_type in [ContentType.INTERLINE_EQUATION]:
span = {
"type": span_type,
"content": block_info.get("content", ""),
}
else:
if block_content:
block_content = clean_content(block_content)
# Split into text / inline-equation spans only when \( and \) are balanced
# and at least one pair is present.
if block_content and block_content.count("\\(") == block_content.count("\\)") and block_content.count("\\(") > 0:
# NOTE(review): switch_code_to_algorithm is assigned but never read in the visible code.
switch_code_to_algorithm = True
# Build a list of spans mixing text and formulas
spans = []
last_end = 0
# Find all formulas
for match in re.finditer(r'\\\((.+?)\\\)', block_content):
start, end = match.span()
# Add the text preceding the formula
if start > last_end:
text_before = block_content[last_end:start]
if text_before.strip():
spans.append({
"bbox": block_bbox,
"type": ContentType.TEXT,
"content": text_before
})
# Add the formula (without the \( and \) delimiters)
formula = match.group(1)
spans.append({
"bbox": block_bbox,
"type": ContentType.INLINE_EQUATION,
"content": formula.strip()
})
last_end = end
# Add the text following the last formula
if last_end < len(block_content):
text_after = block_content[last_end:]
if text_after.strip():
spans.append({
"bbox": block_bbox,
"type": ContentType.TEXT,
"content": text_after
})
span = spans
else:
span = {
"bbox": block_bbox,
"type": span_type,
"content": block_content,
}
# Normalize the span(s) and add them to all_spans
# NOTE(review): as visible here, the image/table/equation span dicts built above
# carry no "bbox" key, so they would fall through to the ValueError branch -- confirm.
if isinstance(span, dict) and "bbox" in span:
self.all_spans.append(span)
spans = [span]
elif isinstance(span, list):
self.all_spans.extend(span)
spans = span
else:
raise ValueError(f"Invalid span type: {span_type}, expected dict or list, got {type(span)}")
blocks.append(
{
"bbox": block_bbox,
"type": block_type,
"lines": [line],
"index": index,
}
)
self.image_blocks = []
self.table_blocks = []
self.interline_equation_blocks = []
self.text_blocks = []
self.title_blocks = []
self.code_blocks = []
self.discarded_blocks = []
self.ref_text_blocks = []
self.phonetic_blocks = []
self.list_blocks = []
# Route every parsed block into its category list; unrecognised types are dropped.
for block in blocks:
if block["type"] in [BlockType.IMAGE_BODY, BlockType.IMAGE_CAPTION, BlockType.IMAGE_FOOTNOTE]:
self.image_blocks.append(block)
elif block["type"] in [BlockType.TABLE_BODY, BlockType.TABLE_CAPTION, BlockType.TABLE_FOOTNOTE]:
self.table_blocks.append(block)
elif block["type"] in [BlockType.CODE_BODY, BlockType.CODE_CAPTION]:
self.code_blocks.append(block)
elif block["type"] == BlockType.INTERLINE_EQUATION:
self.interline_equation_blocks.append(block)
elif block["type"] == BlockType.TEXT:
self.text_blocks.append(block)
elif block["type"] == BlockType.TITLE:
self.title_blocks.append(block)
elif block["type"] in [BlockType.REF_TEXT]:
self.ref_text_blocks.append(block)
elif block["type"] in [BlockType.PHONETIC]:
self.phonetic_blocks.append(block)
elif block["type"] in [BlockType.HEADER, BlockType.FOOTER, BlockType.PAGE_NUMBER, BlockType.ASIDE_TEXT, BlockType.PAGE_FOOTNOTE]:
self.discarded_blocks.append(block)
elif block["type"] == BlockType.LIST:
self.list_blocks.append(block)
else:
continue
# Nest text/ref_text blocks into list blocks, then group body/caption/footnote
# blocks of images, tables and code into two-layer blocks.
self.list_blocks, self.text_blocks, self.ref_text_blocks = fix_list_blocks(self.list_blocks, self.text_blocks, self.ref_text_blocks)
self.image_blocks, not_include_image_blocks = fix_two_layer_blocks(self.image_blocks, BlockType.IMAGE)
self.table_blocks, not_include_table_blocks = fix_two_layer_blocks(self.table_blocks, BlockType.TABLE)
self.code_blocks, not_include_code_blocks = fix_two_layer_blocks(self.code_blocks, BlockType.CODE)
# Lift sub_type / guess_lang from the first line's "extra" of each code body
# onto the enclosing code block; fall back to "code" / "txt" when the body has no lines.
for code_block in self.code_blocks:
for block in code_block['blocks']:
if block['type'] == BlockType.CODE_BODY:
if len(block["lines"]) > 0:
line = block["lines"][0]
code_block["sub_type"] = line["extra"]["type"]
if code_block["sub_type"] in ["code"]:
code_block["guess_lang"] = line["extra"]["guess_lang"]
del line["extra"]
else:
code_block["sub_type"] = "code"
code_block["guess_lang"] = "txt"
# Blocks that could not be attached to any image/table/code body become plain text blocks.
for block in not_include_image_blocks + not_include_table_blocks + not_include_code_blocks:
block["type"] = BlockType.TEXT
self.text_blocks.append(block)
# Plain accessors for the category lists built in __init__.
def get_list_blocks(self):
return self.list_blocks
def get_image_blocks(self):
return self.image_blocks
def get_table_blocks(self):
return self.table_blocks
def get_code_blocks(self):
return self.code_blocks
def get_ref_text_blocks(self):
return self.ref_text_blocks
def get_phonetic_blocks(self):
return self.phonetic_blocks
def get_title_blocks(self):
return self.title_blocks
def get_text_blocks(self):
return self.text_blocks
def get_interline_equation_blocks(self):
return self.interline_equation_blocks
def get_discarded_blocks(self):
return self.discarded_blocks
def get_all_spans(self):
return self.all_spans
def clean_table_html(html: str) -> str:
    """Strip table HTML down to the attributes that describe its structure.

    Every opening or self-closing tag is rebuilt with a lower-cased tag name
    and only its ``colspan`` / ``rowspan`` attributes (always double-quoted).
    All other attributes (style, class, border, ...) and value-less
    attributes are dropped. Closing tags and text content are left untouched.

    Attribute names are matched as whole tokens, so ``data-colspan`` is not
    mistaken for ``colspan``, and unquoted values (``colspan=2``) are read
    without swallowing the tag's closing ``>``.

    Limitation: tags are located with a pattern that stops at the first
    ``>``, so a ``>`` inside a quoted attribute value ends the tag early.

    Args:
        html: Raw table HTML string.

    Returns:
        The cleaned HTML string, or ``""`` when ``html`` is empty or ``None``.
    """
    if not html:
        return ""

    # Attributes worth keeping because they encode the table structure.
    preserved_attrs = {'colspan', 'rowspan'}

    # name, optionally followed by ="value", ='value' or =value.
    attr_pattern = re.compile(
        r'''([^\s"'<>/=]+)'''
        r'''(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'<>]+)))?'''
    )

    def clean_tag(match):
        """Rebuild one tag, keeping only the structural attributes."""
        full_tag = match.group(0)
        tag_name = match.group(1)
        is_self_closing = full_tag.endswith('/>')

        # Parse attributes only from the text between the tag name and the
        # closing '>' or '/>' so delimiters can never leak into a value.
        interior = full_tag[1 + len(tag_name):-2 if is_self_closing else -1]

        kept_attrs = []
        for attr_match in attr_pattern.finditer(interior):
            attr_name = attr_match.group(1).lower()
            if attr_name not in preserved_attrs:
                continue
            attr_value = next(
                (g for g in attr_match.group(2, 3, 4) if g is not None),
                None,
            )
            if attr_value is None:
                # Bare attribute without a value: nothing structural to keep.
                continue
            kept_attrs.append(f'{attr_name}="{attr_value}"')

        attrs_str = (' ' + ' '.join(kept_attrs)) if kept_attrs else ''
        closing = '/>' if is_self_closing else '>'
        return f'<{tag_name.lower()}{attrs_str}{closing}'

    # Matches <tagname ...> or <tagname .../>, capturing the tag name.
    tag_pattern = r'<(\w+)(?:\s+[^>]*)?\s*/?>'
    return re.sub(tag_pattern, clean_tag, html)
def isolated_formula_clean(txt):
    """Return ``txt`` without a leading ``\\[`` and trailing ``\\]``, stripped of outer whitespace."""
    body = txt[2:] if txt.startswith("\\[") else txt[:]
    if body.endswith("\\]"):
        body = body[:-2]
    return body.strip()
def code_content_clean(content):
    """Remove the Markdown fence lines around a code snippet.

    A first line starting with three backticks and a last line consisting
    only of three backticks are dropped; what remains is re-joined with
    newlines and stripped. Empty input yields ``""``.
    """
    if not content:
        return ""

    rows = content.splitlines()
    first = 1 if rows and rows[0].startswith("```") else 0
    last = len(rows)

    # Only look for a closing fence when a line is left after the opening one.
    if last > first and rows[last - 1].strip() == "```":
        last -= 1

    if first >= last:
        return ""
    return "\n".join(rows[first:last]).strip()
def clean_content(content):
    """Turn each ``\\[x\\]`` into ``[x]`` when the delimiters are balanced.

    The text is returned unchanged when it is empty, contains no ``\\[``, or
    has a different number of ``\\[`` and ``\\]``.
    """
    if not content:
        return content

    opening_count = content.count("\\[")
    if opening_count == 0 or opening_count != content.count("\\]"):
        return content

    # Keep what sits between the escaped brackets, drop the backslashes.
    return re.sub(r'\\\[(.*?)\\\]', r'[\1]', content)
def __tie_up_category_by_index(blocks, subject_block_type, object_block_type):
    """Wrapper that associates subject and object blocks based on their index.

    Builds two zero-argument callables -- one for blocks of
    ``subject_block_type``, one for ``object_block_type`` -- and hands them to
    ``tie_up_category_by_index``. Each callable copies the ``bbox``, ``lines``,
    ``index`` and ``angle`` of the matching blocks and passes the copies
    through ``reduct_overlap``.
    """

    def make_getter(wanted_type):
        def getter():
            candidates = [
                {"bbox": blk["bbox"], "lines": blk["lines"], "index": blk["index"], "angle": blk["angle"]}
                for blk in blocks
                if blk["type"] == wanted_type
            ]
            return reduct_overlap(candidates)

        return getter

    # Delegate to the shared implementation.
    return tie_up_category_by_index(
        make_getter(subject_block_type),
        make_getter(object_block_type),
    )
def get_type_blocks(blocks, block_type: Literal["image", "table", "code"]):
    """Pair every ``<block_type>_body`` with its captions and footnotes.

    Returns a list of records with the keys ``<block_type>_body``,
    ``<block_type>_caption_list`` and ``<block_type>_footnote_list``. Caption
    and footnote associations are computed separately and joined on
    ``sub_idx``.
    """
    body_key = f"{block_type}_body"
    caption_links = __tie_up_category_by_index(blocks, body_key, f"{block_type}_caption")
    footnote_links = __tie_up_category_by_index(blocks, body_key, f"{block_type}_footnote")

    records = []
    for link in caption_links:
        entry = {
            body_key: link["sub_bbox"],
            f"{block_type}_caption_list": link["obj_bboxes"],
        }
        wanted_idx = link["sub_idx"]
        # First footnote association belonging to the same body.
        footnote_link = next(item for item in footnote_links if item["sub_idx"] == wanted_idx)
        entry[f"{block_type}_footnote_list"] = footnote_link["obj_bboxes"]
        records.append(entry)
    return records
# Groups each <fix_type>_body with its captions and footnotes into a two-layer block
# {"type", "bbox", "blocks", "index"} and returns (fixed_blocks, not_include_blocks),
# where not_include_blocks holds the blocks that ended up attached to no body.
# NOTE(review): indentation was lost in this view, so the exact extent of the
# `if fix_type in ["table", "image"]` body cannot be read off here -- confirm
# against the full file before restructuring.
def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table", "code"]):
need_fix_blocks = get_type_blocks(blocks, fix_type)
fixed_blocks = []
not_include_blocks = []
processed_indices = set()
# Special handling for table (and image) types: make sure captions come before the body and footnotes after it
if fix_type in ["table", "image"]:
# Collect all misplaced captions and footnotes
# NOTE(review): misplaced_captions is never read or appended to in the visible code,
# and the step comments below jump from step 1 to step 3.
misplaced_captions = [] # stores (caption, original block index)
misplaced_footnotes = [] # stores (footnote, original block index)
# Step 1: remove footnotes that violate the position requirement
for block_idx, block in enumerate(need_fix_blocks):
body = block[f"{fix_type}_body"]
body_index = body["index"]
# Check footnotes: they should come after the body or share its position
valid_footnotes = []
for footnote in block[f"{fix_type}_footnote_list"]:
if footnote["index"] >= body_index:
valid_footnotes.append(footnote)
else:
misplaced_footnotes.append((footnote, block_idx))
block[f"{fix_type}_footnote_list"] = valid_footnotes
# Step 3: reassign the misplaced footnotes to a suitable body
for footnote, original_block_idx in misplaced_footnotes:
footnote_index = footnote["index"]
best_block_idx = None
min_distance = float('inf')
# Look for the nearest body whose index is less than or equal to footnote_index
for idx, block in enumerate(need_fix_blocks):
body_index = block[f"{fix_type}_body"]["index"]
if body_index <= footnote_index and idx != original_block_idx:
distance = footnote_index - body_index
if distance < min_distance:
min_distance = distance
best_block_idx = idx
if best_block_idx is not None:
# Found a suitable body; add the footnote to that block's footnote_list
need_fix_blocks[best_block_idx][f"{fix_type}_footnote_list"].append(footnote)
else:
# No suitable body found; treat it as an ordinary block
not_include_blocks.append(footnote)
# Step 4: pull elements with non-contiguous indices out of each block's caption_list and footnote_list and treat them as ordinary blocks
for block in need_fix_blocks:
caption_list = block[f"{fix_type}_caption_list"]
footnote_list = block[f"{fix_type}_footnote_list"]
body_index = block[f"{fix_type}_body"]["index"]
# Process caption_list (looking backward from the body; captions precede the body)
if caption_list:
# Sort by index in descending order and check starting from the one closest to the body
caption_list.sort(key=lambda x: x["index"], reverse=True)
filtered_captions = [caption_list[0]]
for i in range(1, len(caption_list)):
prev_index = caption_list[i - 1]["index"]
curr_index = caption_list[i]["index"]
# Check whether the indices are contiguous
if curr_index == prev_index - 1:
filtered_captions.append(caption_list[i])
else:
# Check whether the gap contains only body_index
gap_indices = set(range(curr_index + 1, prev_index))
if gap_indices == {body_index}:
# The gap holds only body_index, so it is not a real gap
filtered_captions.append(caption_list[i])
else:
# A real gap: all remaining captions become ordinary blocks
not_include_blocks.extend(caption_list[i:])
break
# Restore ascending order
filtered_captions.reverse()
block[f"{fix_type}_caption_list"] = filtered_captions
# Process footnote_list (looking forward from the body; footnotes follow the body)
if footnote_list:
# Sort by index in ascending order and check starting from the one closest to the body
footnote_list.sort(key=lambda x: x["index"])
filtered_footnotes = [footnote_list[0]]
for i in range(1, len(footnote_list)):
# Check whether it is contiguous with the previous footnote
if footnote_list[i]["index"] == footnote_list[i - 1]["index"] + 1:
filtered_footnotes.append(footnote_list[i])
else:
# A gap: all remaining footnotes become ordinary blocks
not_include_blocks.extend(footnote_list[i:])
break
block[f"{fix_type}_footnote_list"] = filtered_footnotes
# Build the two-layer block structure
for block in need_fix_blocks:
body = block[f"{fix_type}_body"]
caption_list = block[f"{fix_type}_caption_list"]
footnote_list = block[f"{fix_type}_footnote_list"]
body["type"] = f"{fix_type}_body"
for caption in caption_list:
caption["type"] = f"{fix_type}_caption"
processed_indices.add(caption["index"])
for footnote in footnote_list:
footnote["type"] = f"{fix_type}_footnote"
processed_indices.add(footnote["index"])
processed_indices.add(body["index"])
two_layer_block = {
"type": fix_type,
"bbox": body["bbox"],
"blocks": [body],
"index": body["index"],
}
two_layer_block["blocks"].extend([*caption_list, *footnote_list])
# Sort the child blocks by index
two_layer_block["blocks"].sort(key=lambda x: x["index"])
fixed_blocks.append(two_layer_block)
# Add the blocks that were not processed above
for block in blocks:
# Mutates the caller's block: its "type" key is removed before the membership checks.
block.pop("type", None)
if block["index"] not in processed_indices and block not in not_include_blocks:
not_include_blocks.append(block)
return fixed_blocks, not_include_blocks
def fix_list_blocks(list_blocks, text_blocks, ref_text_blocks):
    """Nest text / ref-text blocks inside the list blocks that cover them.

    Every list block gets a fresh ``"blocks"`` list and loses its ``"lines"``
    key. A text or ref-text block is moved into the first list block for which
    ``calculate_overlap_area_in_bbox1_area_ratio(block_bbox, list_bbox)`` is at
    least 0.8, and is removed from the list it came from (both input lists are
    mutated in place). List blocks that end up without children are dropped;
    the survivors receive a ``"sub_type"`` equal to the most frequent child type.

    Returns:
        tuple: ``(list_blocks, text_blocks, ref_text_blocks)`` where
        ``list_blocks`` is a new, filtered list and the other two are the
        mutated inputs.
    """
    # Start every list block as an empty container without its own lines.
    for container in list_blocks:
        container["blocks"] = []
        container.pop("lines", None)

    absorbed = []
    for candidate in text_blocks + ref_text_blocks:
        # First list block that sufficiently covers the candidate, if any.
        owner = next(
            (
                container
                for container in list_blocks
                if calculate_overlap_area_in_bbox1_area_ratio(candidate["bbox"], container["bbox"]) >= 0.8
            ),
            None,
        )
        if owner is not None:
            owner["blocks"].append(candidate)
            absorbed.append(candidate)

    # Absorbed blocks no longer belong to the flat text / ref-text lists.
    for candidate in absorbed:
        if candidate in text_blocks:
            text_blocks.remove(candidate)
        elif candidate in ref_text_blocks:
            ref_text_blocks.remove(candidate)

    # Drop list blocks that did not absorb anything.
    list_blocks = [container for container in list_blocks if container["blocks"]]

    for container in list_blocks:
        # The mode of the children's types becomes the list block's sub_type.
        type_count = {}
        for child in container["blocks"]:
            child_type = child["type"]
            type_count[child_type] = type_count.get(child_type, 0) + 1
        container["sub_type"] = max(type_count, key=type_count.get) if type_count else "unknown"

    return list_blocks, text_blocks, ref_text_blocks
def classify_caption_blocks(page_blocks: list) -> list:
    """Resolve generic ``caption`` blocks into ``image_caption`` / ``table_caption``.

    Rules:
    1. Only a caption adjacent to a block of type ``table`` or ``image`` keeps
       its caption role.
    2. A caption still counts as adjacent when every block between it and the
       table/image is itself a caption.
    3. The caption takes the type of the adjacent parent that precedes it; only
       when there is none is the following parent used.
    4. A caption with no adjacent parent is retyped to ``text``.

    Non-caption blocks are passed through as the same objects; caption blocks
    are shallow-copied before retyping, so the input dicts are not modified.
    A falsy ``page_blocks`` is returned unchanged.
    """
    if not page_blocks:
        return page_blocks

    parent_types = ["table", "image"]
    total = len(page_blocks)

    def nearest_parent(start, step):
        # Walk from `start` in direction `step`, skipping captions, and return
        # the first table/image type met; any other block ends the search.
        pos = start
        while 0 <= pos < total:
            kind = page_blocks[pos].get("type")
            if kind in parent_types:
                return kind
            if kind != "caption":
                return None
            pos += step
        return None

    classified = []
    for idx, block in enumerate(page_blocks):
        if block.get("type") != "caption":
            classified.append(block)
            continue

        # A preceding parent wins over a following one.
        parent = nearest_parent(idx - 1, -1) or nearest_parent(idx + 1, 1)

        retyped = block.copy()
        retyped["type"] = f"{parent}_caption" if parent else "text"
        classified.append(retyped)

    return classified

View File

@@ -0,0 +1,658 @@
import os
from loguru import logger
from mineru.utils.char_utils import full_to_half_exclude_marks, is_hyphen_at_line_end
from mineru.utils.config_reader import get_latex_delimiter_config, get_formula_enable, get_table_enable
from mineru.utils.enum_class import MakeMode, BlockType, ContentType, ContentTypeV2
from mineru.utils.language import detect_lang
# LaTeX delimiters used when formulas are written into markdown: the configured
# value wins when it is truthy, otherwise the '$$' / '$' defaults apply.
latex_delimiters_config = get_latex_delimiter_config()
default_delimiters = {
    'display': {'left': '$$', 'right': '$$'},
    'inline': {'left': '$', 'right': '$'}
}
delimiters = latex_delimiters_config or default_delimiters
display_left_delimiter, display_right_delimiter = (
    delimiters['display']['left'],
    delimiters['display']['right'],
)
inline_left_delimiter, inline_right_delimiter = (
    delimiters['inline']['left'],
    delimiters['inline']['right'],
)
def merge_para_with_text(para_block, formula_enable=True, img_buket_path=''):
    """Flatten the spans of ``para_block`` into one markdown string.

    Text spans are first normalised in place with
    ``full_to_half_exclude_marks`` and their concatenation is passed to
    ``detect_lang``. Inline equations are wrapped in the inline delimiters.
    Interline equations are wrapped in the display delimiters when
    ``formula_enable`` is true, otherwise rendered as an image link when the
    span has an ``image_path``.

    Args:
        para_block: Block dict with ``'lines'``, each holding ``'spans'``.
        formula_enable: Whether interline equations are emitted as LaTeX.
        img_buket_path: Prefix placed before a span's ``image_path``.

    Returns:
        str: The merged paragraph text.
    """
    lines = para_block['lines']

    # Pass 1: normalise text spans and gather the text for language detection.
    text_pieces = []
    for line in lines:
        for span in line['spans']:
            if span['type'] == ContentType.TEXT:
                span['content'] = full_to_half_exclude_marks(span['content'])
                text_pieces.append(span['content'])
    block_lang = detect_lang(''.join(text_pieces))

    # CJK languages (Chinese / Japanese / Korean)
    cjk_langs = {'zh', 'ja', 'ko'}

    def next_line_starts_lowercase(line_idx):
        # True when a following line exists and its first span is non-empty
        # text beginning with a lowercase letter.
        if line_idx + 1 >= len(lines):
            return False
        following_spans = lines[line_idx + 1].get('spans')
        if not following_spans:
            return False
        head = following_spans[0]
        if head.get('type') != ContentType.TEXT:
            return False
        head_text = head.get('content', '')
        return bool(head_text) and head_text[0].islower()

    # Pass 2: render every span and join with language-dependent spacing.
    pieces = []
    for line_idx, line in enumerate(lines):
        spans = line['spans']
        for span_idx, span in enumerate(spans):
            span_type = span['type']
            if span_type == ContentType.TEXT:
                content = span['content']
            elif span_type == ContentType.INLINE_EQUATION:
                content = f"{inline_left_delimiter}{span['content']}{inline_right_delimiter}"
            elif span_type == ContentType.INTERLINE_EQUATION:
                if formula_enable:
                    content = f"\n{display_left_delimiter}\n{span['content']}\n{display_right_delimiter}\n"
                elif span.get('image_path', ''):
                    content = f"![]({img_buket_path}/{span['image_path']})"
                else:
                    content = ''
            else:
                content = ''

            content = content.strip()
            if not content:
                continue

            # Interline equations are appended without any separator.
            if span_type == ContentType.INTERLINE_EQUATION:
                pieces.append(content)
                continue

            # Is this the last span of its line?
            is_last_span = span_idx == len(spans) - 1

            if block_lang in cjk_langs:
                # In CJK text a line break needs no separating space, except
                # after an inline equation, which still gets one.
                if is_last_span and span_type != ContentType.INLINE_EQUATION:
                    pieces.append(content)
                else:
                    pieces.append(f'{content} ')
                continue

            # Western text: only text / inline-equation spans are emitted.
            if span_type not in [ContentType.TEXT, ContentType.INLINE_EQUATION]:
                continue

            hyphen_at_line_end = (
                is_last_span
                and span_type == ContentType.TEXT
                and is_hyphen_at_line_end(content)
            )
            if not hyphen_at_line_end:
                # Western text needs a space between pieces.
                pieces.append(f'{content} ')
            elif next_line_starts_lowercase(line_idx):
                # The word continues on the next line: drop the hyphen.
                pieces.append(content[:-1])
            else:
                # No next line, or it does not start lowercase: keep the
                # hyphen but add no space.
                pieces.append(content)

    return ''.join(pieces)
def mk_blocks_to_markdown(para_blocks, make_mode, formula_enable, table_enable, img_buket_path=''):
    """Render the blocks of one page as a list of markdown fragments.

    Args:
        para_blocks: Iterable of block dicts, each with a ``'type'``.
        make_mode: ``MakeMode.NLP_MD`` skips image and table blocks;
            ``MakeMode.MM_MD`` renders them.
        formula_enable: Forwarded to ``merge_para_with_text`` for text-like
            blocks and list items.
        table_enable: When true a table span's ``html`` is preferred over its
            image.
        img_buket_path: Prefix placed before every ``image_path``.

    Returns:
        list[str]: One stripped, non-empty markdown string per rendered block.
    """

    def children(parent, wanted_type):
        # Sub-blocks of `parent` with the given type, in document order.
        return [child for child in parent['blocks'] if child['type'] == wanted_type]

    def spans_of(body_block, wanted_type):
        # Spans of the given type inside a body block, in document order.
        return [
            span
            for line in body_block['lines']
            for span in line['spans']
            if span['type'] == wanted_type
        ]

    def image_link(span):
        return f"![]({img_buket_path}/{span['image_path']})"

    def render_image(image_block):
        footnotes = children(image_block, BlockType.IMAGE_FOOTNOTE)
        captions = children(image_block, BlockType.IMAGE_CAPTION)
        pictures = ''.join(
            image_link(span)
            for body in children(image_block, BlockType.IMAGE_BODY)
            for span in spans_of(body, ContentType.IMAGE)
            if span.get('image_path', '')
        )
        if footnotes:
            # With a footnote the order is caption, picture, footnote.
            text = ''.join(merge_para_with_text(caption) + ' \n' for caption in captions)
            text += pictures
            text += ''.join(' \n' + merge_para_with_text(footnote) for footnote in footnotes)
            return text
        # Without a footnote the picture comes first, then the caption.
        return pictures + ''.join(' \n' + merge_para_with_text(caption) for caption in captions)

    def render_table(table_block):
        # Order: caption, body, footnote.
        parts = [
            merge_para_with_text(caption) + ' \n'
            for caption in children(table_block, BlockType.TABLE_CAPTION)
        ]
        for body in children(table_block, BlockType.TABLE_BODY):
            for span in spans_of(body, ContentType.TABLE):
                if table_enable and span.get('html', ''):
                    parts.append(f"\n{span['html']}\n")
                elif span.get('image_path', ''):
                    parts.append(image_link(span))
        parts.extend(
            '\n' + merge_para_with_text(footnote) + ' '
            for footnote in children(table_block, BlockType.TABLE_FOOTNOTE)
        )
        return ''.join(parts)

    def render_code(code_block):
        # Order: caption, then body (fenced for code, plain for algorithms).
        sub_type = code_block["sub_type"]
        parts = [
            merge_para_with_text(caption) + ' \n'
            for caption in children(code_block, BlockType.CODE_CAPTION)
        ]
        for body in children(code_block, BlockType.CODE_BODY):
            if sub_type == BlockType.CODE:
                guess_lang = code_block["guess_lang"]
                parts.append(f"```{guess_lang}\n{merge_para_with_text(body)}\n```")
            elif sub_type == BlockType.ALGORITHM:
                parts.append(merge_para_with_text(body))
        return ''.join(parts)

    page_markdown = []
    for para_block in para_blocks:
        para_type = para_block['type']
        para_text = ''
        if para_type in [BlockType.TEXT, BlockType.INTERLINE_EQUATION, BlockType.PHONETIC, BlockType.REF_TEXT]:
            para_text = merge_para_with_text(para_block, formula_enable=formula_enable, img_buket_path=img_buket_path)
        elif para_type == BlockType.LIST:
            for item in para_block['blocks']:
                item_text = merge_para_with_text(item, formula_enable=formula_enable, img_buket_path=img_buket_path)
                para_text += f"{item_text} \n"
        elif para_type == BlockType.TITLE:
            hashes = "#" * get_title_level(para_block)
            para_text = f'{hashes} {merge_para_with_text(para_block)}'
        elif para_type == BlockType.IMAGE:
            if make_mode == MakeMode.NLP_MD:
                continue
            if make_mode == MakeMode.MM_MD:
                para_text = render_image(para_block)
        elif para_type == BlockType.TABLE:
            if make_mode == MakeMode.NLP_MD:
                continue
            if make_mode == MakeMode.MM_MD:
                para_text = render_table(para_block)
        elif para_type == BlockType.CODE:
            para_text = render_code(para_block)

        # Blocks that render to nothing but whitespace are left out.
        rendered = para_text.strip()
        if rendered:
            page_markdown.append(rendered)

    return page_markdown
def make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size):
    """Convert one block into a flat content-list entry.

    Args:
        para_block: Block dict with a ``'type'``. Leaf blocks carry
            ``'lines'``; list / image / table / code blocks carry ``'blocks'``.
        img_buket_path: Prefix placed before every span ``image_path``.
        page_idx: Stored verbatim under ``'page_idx'``.
        page_size: ``(width, height)`` used to scale ``'bbox'`` to a 0-1000
            range. May be None; it is only read when the block has a bbox.

    Returns:
        dict: The content entry. For an unrecognised block type it contains
        only ``'page_idx'`` (plus ``'bbox'`` when available).
    """
    para_type = para_block['type']
    para_content = {}
    if para_type in [
        BlockType.TEXT,
        BlockType.REF_TEXT,
        BlockType.PHONETIC,
        BlockType.HEADER,
        BlockType.FOOTER,
        BlockType.PAGE_NUMBER,
        BlockType.ASIDE_TEXT,
        BlockType.PAGE_FOOTNOTE,
    ]:
        para_content = {
            'type': para_type,
            'text': merge_para_with_text(para_block),
        }
    elif para_type == BlockType.LIST:
        para_content = {
            'type': para_type,
            'sub_type': para_block.get('sub_type', ''),
            'list_items': [],
        }
        for block in para_block['blocks']:
            item_text = merge_para_with_text(block)
            # Items that are empty after stripping are not listed.
            if item_text.strip():
                para_content['list_items'].append(item_text)
    elif para_type == BlockType.TITLE:
        title_level = get_title_level(para_block)
        para_content = {
            'type': ContentType.TEXT,
            'text': merge_para_with_text(para_block),
        }
        # Level 0 means "not a real heading": emit plain text without a level.
        if title_level != 0:
            para_content['text_level'] = title_level
    elif para_type == BlockType.INTERLINE_EQUATION:
        para_content = {
            'type': ContentType.EQUATION,
            'text': merge_para_with_text(para_block),
            'text_format': 'latex',
        }
    elif para_type == BlockType.IMAGE:
        para_content = {'type': ContentType.IMAGE, 'img_path': '', BlockType.IMAGE_CAPTION: [], BlockType.IMAGE_FOOTNOTE: []}
        for block in para_block['blocks']:
            if block['type'] == BlockType.IMAGE_BODY:
                for line in block['lines']:
                    for span in line['spans']:
                        if span['type'] == ContentType.IMAGE:
                            if span.get('image_path', ''):
                                para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"
            if block['type'] == BlockType.IMAGE_CAPTION:
                para_content[BlockType.IMAGE_CAPTION].append(merge_para_with_text(block))
            if block['type'] == BlockType.IMAGE_FOOTNOTE:
                para_content[BlockType.IMAGE_FOOTNOTE].append(merge_para_with_text(block))
    elif para_type == BlockType.TABLE:
        para_content = {'type': ContentType.TABLE, 'img_path': '', BlockType.TABLE_CAPTION: [], BlockType.TABLE_FOOTNOTE: []}
        for block in para_block['blocks']:
            if block['type'] == BlockType.TABLE_BODY:
                for line in block['lines']:
                    for span in line['spans']:
                        if span['type'] == ContentType.TABLE:
                            if span.get('html', ''):
                                para_content[BlockType.TABLE_BODY] = f"{span['html']}"
                            if span.get('image_path', ''):
                                para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"
            if block['type'] == BlockType.TABLE_CAPTION:
                para_content[BlockType.TABLE_CAPTION].append(merge_para_with_text(block))
            if block['type'] == BlockType.TABLE_FOOTNOTE:
                para_content[BlockType.TABLE_FOOTNOTE].append(merge_para_with_text(block))
    elif para_type == BlockType.CODE:
        para_content = {'type': BlockType.CODE, 'sub_type': para_block["sub_type"], BlockType.CODE_CAPTION: []}
        for block in para_block['blocks']:
            if block['type'] == BlockType.CODE_BODY:
                para_content[BlockType.CODE_BODY] = merge_para_with_text(block)
                if para_block["sub_type"] == BlockType.CODE:
                    para_content["guess_lang"] = para_block["guess_lang"]
            if block['type'] == BlockType.CODE_CAPTION:
                para_content[BlockType.CODE_CAPTION].append(merge_para_with_text(block))

    para_bbox = para_block.get('bbox')
    # Unpack page_size only when there is a bbox to scale: a block without
    # geometry must not fail merely because the page has no recorded size.
    if para_bbox and page_size:
        page_width, page_height = page_size
        x0, y0, x1, y1 = para_bbox
        para_content['bbox'] = [
            int(x0 * 1000 / page_width),
            int(y0 * 1000 / page_height),
            int(x1 * 1000 / page_width),
            int(y1 * 1000 / page_height),
        ]

    para_content['page_idx'] = page_idx

    return para_content
def make_blocks_to_content_list_v2(para_block, img_buket_path, page_size):
    """Convert one block into a structured (v2) content-list entry.

    Args:
        para_block: Block dict with a ``'type'``. Leaf blocks carry
            ``'lines'``; image / table / code / list blocks carry ``'blocks'``.
        img_buket_path: Prefix placed before every image path.
        page_size: ``(width, height)`` used to scale ``'bbox'`` to a 0-1000
            range. May be None; it is only read when the block has a bbox.

    Returns:
        dict: ``{'type': ..., 'content': {...}}`` plus ``'bbox'`` when
        available. An unrecognised block type yields a dict without
        ``'type'`` / ``'content'``.

    Raises:
        ValueError: For a code block whose ``sub_type`` is neither code nor
            algorithm, or a list block with an unknown ``sub_type``.
    """
    para_type = para_block['type']
    para_content = {}
    if para_type in [
        BlockType.HEADER,
        BlockType.FOOTER,
        BlockType.ASIDE_TEXT,
        BlockType.PAGE_NUMBER,
        BlockType.PAGE_FOOTNOTE,
    ]:
        if para_type == BlockType.HEADER:
            content_type = ContentTypeV2.PAGE_HEADER
        elif para_type == BlockType.FOOTER:
            content_type = ContentTypeV2.PAGE_FOOTER
        elif para_type == BlockType.ASIDE_TEXT:
            content_type = ContentTypeV2.PAGE_ASIDE_TEXT
        elif para_type == BlockType.PAGE_NUMBER:
            content_type = ContentTypeV2.PAGE_NUMBER
        elif para_type == BlockType.PAGE_FOOTNOTE:
            content_type = ContentTypeV2.PAGE_FOOTNOTE
        else:
            raise ValueError(f"Unknown para_type: {para_type}")
        para_content = {
            'type': content_type,
            'content': {
                f"{content_type}_content": merge_para_with_text_v2(para_block),
            }
        }
    elif para_type == BlockType.TITLE:
        title_level = get_title_level(para_block)
        if title_level != 0:
            para_content = {
                'type': ContentTypeV2.TITLE,
                'content': {
                    "title_content": merge_para_with_text_v2(para_block),
                    "level": title_level
                }
            }
        else:
            # Level 0 means "not a real heading": emit it as a paragraph.
            para_content = {
                'type': ContentTypeV2.PARAGRAPH,
                'content': {
                    "paragraph_content": merge_para_with_text_v2(para_block),
                }
            }
    elif para_type in [
        BlockType.TEXT,
        BlockType.PHONETIC
    ]:
        para_content = {
            'type': ContentTypeV2.PARAGRAPH,
            'content': {
                'paragraph_content': merge_para_with_text_v2(para_block),
            }
        }
    elif para_type == BlockType.INTERLINE_EQUATION:
        image_path, math_content = get_body_data(para_block)
        para_content = {
            'type': ContentTypeV2.EQUATION_INTERLINE,
            'content': {
                'math_content': math_content,
                'math_type': 'latex',
                'image_source': {'path': f"{img_buket_path}/{image_path}"},
            }
        }
    elif para_type == BlockType.IMAGE:
        image_caption = []
        image_footnote = []
        image_path, _ = get_body_data(para_block)
        image_source = {
            'path': f"{img_buket_path}/{image_path}",
        }
        for block in para_block['blocks']:
            if block['type'] == BlockType.IMAGE_CAPTION:
                image_caption.extend(merge_para_with_text_v2(block))
            if block['type'] == BlockType.IMAGE_FOOTNOTE:
                image_footnote.extend(merge_para_with_text_v2(block))
        para_content = {
            'type': ContentTypeV2.IMAGE,
            'content': {
                'image_source': image_source,
                'image_caption': image_caption,
                'image_footnote': image_footnote,
            }
        }
    elif para_type == BlockType.TABLE:
        table_caption = []
        table_footnote = []
        image_path, html = get_body_data(para_block)
        image_source = {
            'path': f"{img_buket_path}/{image_path}",
        }
        # More than one "<table" in the html is reported as nesting level 2.
        if html.count("<table") > 1:
            table_nest_level = 2
        else:
            table_nest_level = 1
        # Merged cells or nested tables make the table "complex".
        if (
            "colspan" in html or
            "rowspan" in html or
            table_nest_level > 1
        ):
            table_type = ContentTypeV2.TABLE_COMPLEX
        else:
            table_type = ContentTypeV2.TABLE_SIMPLE

        for block in para_block['blocks']:
            if block['type'] == BlockType.TABLE_CAPTION:
                table_caption.extend(merge_para_with_text_v2(block))
            if block['type'] == BlockType.TABLE_FOOTNOTE:
                table_footnote.extend(merge_para_with_text_v2(block))
        para_content = {
            'type': ContentTypeV2.TABLE,
            'content': {
                'image_source': image_source,
                'table_caption': table_caption,
                'table_footnote': table_footnote,
                'html': html,
                'table_type': table_type,
                'table_nest_level': table_nest_level,
            }
        }
    elif para_type == BlockType.CODE:
        code_caption = []
        code_content = []
        for block in para_block['blocks']:
            if block['type'] == BlockType.CODE_CAPTION:
                code_caption.extend(merge_para_with_text_v2(block))
            if block['type'] == BlockType.CODE_BODY:
                code_content = merge_para_with_text_v2(block)
        sub_type = para_block["sub_type"]
        if sub_type == BlockType.CODE:
            para_content = {
                'type': ContentTypeV2.CODE,
                'content': {
                    'code_caption': code_caption,
                    'code_content': code_content,
                    'code_language': para_block.get('guess_lang', 'txt'),
                }
            }
        elif sub_type == BlockType.ALGORITHM:
            para_content = {
                'type': ContentTypeV2.ALGORITHM,
                'content': {
                    'algorithm_caption': code_caption,
                    'algorithm_content': code_content,
                }
            }
        else:
            raise ValueError(f"Unknown code sub_type: {sub_type}")
    elif para_type == BlockType.REF_TEXT:
        # A stand-alone reference is emitted as a one-item reference list.
        para_content = {
            'type': ContentTypeV2.LIST,
            'content': {
                'list_type': ContentTypeV2.LIST_REF,
                'list_items': [
                    {
                        'item_type': 'text',
                        'item_content': merge_para_with_text_v2(para_block),
                    }
                ],
            }
        }
    elif para_type == BlockType.LIST:
        if 'sub_type' in para_block:
            if para_block['sub_type'] == BlockType.REF_TEXT:
                list_type = ContentTypeV2.LIST_REF
            elif para_block['sub_type'] == BlockType.TEXT:
                list_type = ContentTypeV2.LIST_TEXT
            else:
                raise ValueError(f"Unknown list sub_type: {para_block['sub_type']}")
        else:
            list_type = ContentTypeV2.LIST_TEXT
        list_items = []
        for block in para_block['blocks']:
            item_content = merge_para_with_text_v2(block)
            # Items that produced no spans are not listed.
            if item_content:
                list_items.append({
                    'item_type': 'text',
                    'item_content': item_content,
                })
        para_content = {
            'type': ContentTypeV2.LIST,
            'content': {
                'list_type': list_type,
                'list_items': list_items,
            }
        }

    para_bbox = para_block.get('bbox')
    # Unpack page_size only when there is a bbox to scale: a block without
    # geometry must not fail merely because the page has no recorded size.
    if para_bbox and page_size:
        page_width, page_height = page_size
        x0, y0, x1, y1 = para_bbox
        para_content['bbox'] = [
            int(x0 * 1000 / page_width),
            int(y0 * 1000 / page_height),
            int(x1 * 1000 / page_width),
            int(y1 * 1000 / page_height),
        ]

    return para_content
def get_body_data(para_block):
    """Return ``(image_path, payload)`` taken from the first relevant span.

    The payload depends on the type of the first recognised span:
    - table span: ``(image_path, html)``
    - image span: ``(image_path, '')``
    - interline-equation span: ``(image_path, content)``
    - text span: ``('', content)``
    - nothing recognised: ``('', '')``

    When ``para_block`` has ``'blocks'``, only its image / table / code body
    sub-blocks are searched and the first non-empty result wins; otherwise the
    block's own ``'lines'`` are searched.
    """

    def first_payload(lines):
        # Scan spans in order and stop at the first one of a known type.
        for line in lines:
            for span in line.get('spans', []):
                kind = span.get('type')
                if kind == ContentType.TABLE:
                    return span.get('image_path', ''), span.get('html', '')
                if kind == ContentType.IMAGE:
                    return span.get('image_path', ''), ''
                if kind == ContentType.INTERLINE_EQUATION:
                    return span.get('image_path', ''), span.get('content', '')
                if kind == ContentType.TEXT:
                    return '', span.get('content', '')
        return '', ''

    # Flat structure: the block holds its lines directly.
    if 'blocks' not in para_block:
        return first_payload(para_block.get('lines', []))

    # Nested structure: look inside the body sub-blocks only.
    for child in para_block['blocks']:
        if child.get('type') not in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY, BlockType.CODE_BODY]:
            continue
        payload = first_payload(child.get('lines', []))
        if payload != ('', ''):
            return payload
    return '', ''
def merge_para_with_text_v2(para_block):
    """Flatten the spans of ``para_block`` into a list of typed span dicts.

    Text spans are first normalised in place with
    ``full_to_half_exclude_marks`` and their concatenation is passed to
    ``detect_lang``. Spans whose content is empty after stripping are skipped.
    Consecutive text spans are merged into one entry, with spacing that
    depends on the detected language; phonetic and inline-equation spans are
    appended as separate entries. Any other span type is logged and skipped.

    Returns:
        list[dict]: Entries of the form ``{'type': ..., 'content': ...}``.
    """
    lines = para_block['lines']

    # Pass 1: normalise text spans and gather the text for language detection.
    plain_text = ''
    for line in lines:
        for span in line['spans']:
            if span['type'] == ContentType.TEXT:
                span['content'] = full_to_half_exclude_marks(span['content'])
                plain_text += span['content']
    block_lang = detect_lang(plain_text)

    # CJK languages (Chinese / Japanese / Korean)
    cjk_langs = {'zh', 'ja', 'ko'}

    def next_line_starts_lowercase(line_idx):
        # True when a following line exists and its first span is non-empty
        # text beginning with a lowercase letter.
        if line_idx + 1 >= len(lines):
            return False
        following_spans = lines[line_idx + 1].get('spans')
        if not following_spans:
            return False
        head = following_spans[0]
        if head.get('type') != ContentType.TEXT:
            return False
        head_text = head.get('content', '')
        return bool(head_text) and head_text[0].islower()

    merged = []
    para_type = para_block['type']
    for line_idx, line in enumerate(lines):
        spans = line['spans']
        for span_idx, span in enumerate(spans):
            span_type = span['type']
            if not span.get("content", '').strip():
                continue

            # Map the middle-json span types onto the v2 span types.
            if span_type == ContentType.TEXT:
                if para_type == BlockType.PHONETIC:
                    span_type = ContentTypeV2.SPAN_PHONETIC
                else:
                    span_type = ContentTypeV2.SPAN_TEXT
            if span_type == ContentType.INLINE_EQUATION:
                span_type = ContentTypeV2.SPAN_EQUATION_INLINE

            if span_type in [ContentTypeV2.SPAN_TEXT]:
                raw = span['content']
                # Is this the last span of its line?
                is_last_span = span_idx == len(spans) - 1

                if block_lang in cjk_langs:
                    # In CJK text a line break needs no separating space.
                    piece = raw if is_last_span else f"{raw} "
                elif is_last_span and is_hyphen_at_line_end(raw):
                    # Line ends with a hyphen: drop it when the next line
                    # continues in lowercase, otherwise keep it; no space
                    # is added in either case.
                    piece = raw[:-1] if next_line_starts_lowercase(line_idx) else raw
                else:
                    # Western text needs a space between pieces.
                    piece = f"{raw} "

                if merged and merged[-1]['type'] == span_type:
                    # Merge with the preceding span of the same type.
                    merged[-1]['content'] += piece
                else:
                    merged.append({
                        'type': span_type,
                        'content': piece,
                    })
            elif span_type in [
                ContentTypeV2.SPAN_PHONETIC,
                ContentTypeV2.SPAN_EQUATION_INLINE,
            ]:
                merged.append({
                    'type': span_type,
                    'content': span['content'],
                })
            else:
                logger.warning(f"Unknown span type in merge_para_with_text_v2: {span_type}")

    return merged
def union_make(pdf_info_dict: list,
               make_mode: str,
               img_buket_path: str = '',
               ):
    """Build the final output for all pages in the requested ``make_mode``.

    Args:
        pdf_info_dict: List of page-info dicts; ``para_blocks``,
            ``discarded_blocks``, ``page_idx`` and ``page_size`` are read.
        make_mode: One of the ``MakeMode`` values.
        img_buket_path: Prefix placed before image paths.

    Returns:
        - markdown modes: page fragments joined with blank lines (``str``);
        - ``CONTENT_LIST``: one flat list of entries over all pages;
        - ``CONTENT_LIST_V2``: one list of entries per page;
        - any other mode: ``None``.
    """
    formula_flag = os.getenv('MINERU_VLM_FORMULA_ENABLE', 'True').lower() == 'true'
    formula_enable = get_formula_enable(formula_flag)
    table_flag = os.getenv('MINERU_VLM_TABLE_ENABLE', 'True').lower() == 'true'
    table_enable = get_table_enable(table_flag)

    output_content = []
    for page_info in pdf_info_dict:
        paras_of_layout = page_info.get('para_blocks')
        paras_of_discarded = page_info.get('discarded_blocks')
        page_idx = page_info.get('page_idx')
        page_size = page_info.get('page_size')

        if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
            # Markdown only uses the layout blocks; empty pages add nothing.
            if paras_of_layout:
                output_content.extend(
                    mk_blocks_to_markdown(paras_of_layout, make_mode, formula_enable, table_enable, img_buket_path)
                )
        elif make_mode == MakeMode.CONTENT_LIST:
            every_block = (paras_of_layout or []) + (paras_of_discarded or [])
            for para_block in every_block:
                output_content.append(
                    make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
                )
        elif make_mode == MakeMode.CONTENT_LIST_V2:
            # https://github.com/drunkpig/llm-webkit-mirror/blob/dev6/docs/specification/output_format/content_list_spec.md
            every_block = (paras_of_layout or []) + (paras_of_discarded or [])
            # Each page contributes one (possibly empty) list.
            output_content.append([
                make_blocks_to_content_list_v2(para_block, img_buket_path, page_size)
                for para_block in every_block
            ])

    if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
        return '\n\n'.join(output_content)
    if make_mode in [MakeMode.CONTENT_LIST, MakeMode.CONTENT_LIST_V2]:
        return output_content
    return None
def get_title_level(block):
    """Return the heading level of ``block``.

    ``block['level']`` defaults to 1 when absent. Values above 4 are capped at
    4, values below 1 become 0, and anything in between is returned unchanged.
    """
    level = block.get('level', 1)
    if level > 4:
        return 4
    if level < 1:
        return 0
    return level

View File

@@ -15,7 +15,7 @@ from mineru.utils.config_reader import get_device
from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path
from mineru.utils.model_utils import get_vram
from ..version import __version__
from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
from .common import do_parse, read_fn, pdf_suffixes, image_suffixes, office_suffixes
@click.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=True))
@@ -213,7 +213,7 @@ def main(
if os.path.isdir(input_path):
doc_path_list = []
for doc_path in Path(input_path).glob('*'):
if guess_suffix_by_path(doc_path) in pdf_suffixes + image_suffixes:
if guess_suffix_by_path(doc_path) in pdf_suffixes + image_suffixes + office_suffixes:
doc_path_list.append(doc_path)
parse_doc(doc_path_list)
else:

View File

@@ -15,10 +15,12 @@ from mineru.utils.enum_class import MakeMode
from mineru.utils.guess_suffix_or_lang import guess_suffix_by_bytes
from mineru.utils.pdf_image_tools import images_bytes_to_pdf_bytes
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
from mineru.backend.office.office_middle_json_mkcontent import union_make as office_union_make
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
from mineru.backend.vlm.vlm_analyze import aio_doc_analyze as aio_vlm_doc_analyze
from mineru.backend.hybrid.hybrid_analyze import doc_analyze as hybrid_doc_analyze
from mineru.backend.hybrid.hybrid_analyze import aio_doc_analyze as aio_hybrid_doc_analyze
from mineru.backend.office.docx_analyze import office_docx_analyze
from mineru.utils.pdf_page_id import get_end_page_id
if os.getenv("MINERU_LMDEPLOY_DEVICE", "") == "maca":
@@ -28,6 +30,10 @@ if os.getenv("MINERU_LMDEPLOY_DEVICE", "") == "maca":
pdf_suffixes = ["pdf"]
image_suffixes = ["png", "jpeg", "jp2", "webp", "gif", "bmp", "jpg", "tiff"]
docx_suffixes = ["docx"]
pptx_suffixes = ["pptx"]
xlsx_suffixes = ["xlsx"]
office_suffixes = docx_suffixes + pptx_suffixes + xlsx_suffixes
os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -39,7 +45,7 @@ def read_fn(path):
file_suffix = guess_suffix_by_bytes(file_bytes, path)
if file_suffix in image_suffixes:
return images_bytes_to_pdf_bytes(file_bytes)
elif file_suffix in pdf_suffixes:
elif file_suffix in pdf_suffixes + office_suffixes:
return file_bytes
else:
raise Exception(f"Unknown file suffix: {file_suffix}")
@@ -110,10 +116,18 @@ def _process_output(
f_make_md_mode,
middle_json,
model_output=None,
is_pipeline=True
process_mode="vlm",
):
f_draw_line_sort_bbox = False
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
if process_mode == "pipeline":
make_func = pipeline_union_make
elif process_mode == "vlm":
make_func = vlm_union_make
elif process_mode in office_suffixes:
make_func = office_union_make
else:
raise Exception(f"Unknown process_mode: {process_mode}")
"""处理输出文件"""
if f_draw_layout_bbox:
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")
@@ -122,10 +136,16 @@ def _process_output(
draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")
if f_dump_orig_pdf:
md_writer.write(
f"{pdf_file_name}_origin.pdf",
pdf_bytes,
)
if process_mode in ["pipeline", "vlm"]:
md_writer.write(
f"{pdf_file_name}_origin.pdf",
pdf_bytes,
)
elif process_mode in office_suffixes:
md_writer.write(
f"{pdf_file_name}_origin.{process_mode}",
pdf_bytes,
)
if f_draw_line_sort_bbox:
draw_line_sort_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_line_sort.pdf")
@@ -133,7 +153,6 @@ def _process_output(
image_dir = str(os.path.basename(local_image_dir))
if f_dump_md:
make_func = pipeline_union_make if is_pipeline else vlm_union_make
md_content_str = make_func(pdf_info, f_make_md_mode, image_dir)
md_writer.write_string(
f"{pdf_file_name}.md",
@@ -141,13 +160,12 @@ def _process_output(
)
if f_dump_content_list:
make_func = pipeline_union_make if is_pipeline else vlm_union_make
content_list = make_func(pdf_info, MakeMode.CONTENT_LIST, image_dir)
md_writer.write_string(
f"{pdf_file_name}_content_list.json",
json.dumps(content_list, ensure_ascii=False, indent=4),
)
if not is_pipeline:
if process_mode != "pipeline":
content_list_v2 = make_func(pdf_info, MakeMode.CONTENT_LIST_V2, image_dir)
md_writer.write_string(
f"{pdf_file_name}_content_list_v2.json",
@@ -221,7 +239,7 @@ def _process_pipeline(
pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
f_make_md_mode, middle_json, model_json, is_pipeline=True
f_make_md_mode, middle_json, model_json, process_mode="pipeline"
)
@@ -262,7 +280,7 @@ async def _async_process_vlm(
pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
f_make_md_mode, middle_json, infer_result, is_pipeline=False
f_make_md_mode, middle_json, infer_result, process_mode="vlm"
)
@@ -303,7 +321,7 @@ def _process_vlm(
pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
f_make_md_mode, middle_json, infer_result, is_pipeline=False
f_make_md_mode, middle_json, infer_result, process_mode="vlm"
)
@@ -355,7 +373,7 @@ def _process_hybrid(
pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
f_make_md_mode, middle_json, infer_result, is_pipeline=False
f_make_md_mode, middle_json, infer_result, process_mode="vlm"
)
@@ -408,10 +426,56 @@ async def _async_process_hybrid(
pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
f_make_md_mode, middle_json, infer_result, is_pipeline=False
f_make_md_mode, middle_json, infer_result, process_mode="vlm"
)
def _process_office_doc(
        output_dir,
        pdf_file_names: list[str],
        pdf_bytes_list: list[bytes],
        f_dump_md=True,
        f_dump_middle_json=True,
        f_dump_orig_file=True,
        f_dump_content_list=True,
        f_make_md_mode=MakeMode.MM_MD,
):
    """Handle the office documents contained in a batch of input files.

    Each entry's suffix is guessed from its bytes. DOCX entries are analysed
    with ``office_docx_analyze`` and written out through ``_process_output``;
    PPTX and XLSX entries only produce a warning. Other entries are ignored.

    Returns:
        list[int]: Indices of the entries recognised as office documents.
    """
    need_remove_index = []
    for idx, file_bytes in enumerate(pdf_bytes_list):
        pdf_file_name = pdf_file_names[idx]
        suffix = guess_suffix_by_bytes(file_bytes)

        if suffix in docx_suffixes:
            need_remove_index.append(idx)
            local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, "office")
            image_writer = FileBasedDataWriter(local_image_dir)
            md_writer = FileBasedDataWriter(local_md_dir)

            middle_json, infer_result = office_docx_analyze(
                file_bytes,
                image_writer=image_writer,
            )

            # Model output dump and bbox drawings are switched off for office files.
            f_dump_model_output = False
            f_draw_layout_bbox = False
            f_draw_span_bbox = False

            _process_output(
                middle_json["pdf_info"], file_bytes, pdf_file_name, local_md_dir, local_image_dir,
                md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_file,
                f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
                f_make_md_mode, middle_json, infer_result, process_mode="docx"
            )
            continue

        if suffix in pptx_suffixes:
            need_remove_index.append(idx)
            logger.warning(f"Currently, PPTX files are not supported: {pdf_file_name}")
            continue

        if suffix in xlsx_suffixes:
            need_remove_index.append(idx)
            logger.warning(f"Currently, XLSX files are not supported: {pdf_file_name}")

    return need_remove_index
def do_parse(
output_dir,
pdf_file_names: list[str],
@@ -434,6 +498,24 @@ def do_parse(
end_page_id=None,
**kwargs,
):
need_remove_index = _process_office_doc(
output_dir,
pdf_file_names=pdf_file_names,
pdf_bytes_list=pdf_bytes_list,
f_dump_md=f_dump_md,
f_dump_middle_json=f_dump_middle_json,
f_dump_orig_file=f_dump_orig_pdf,
f_dump_content_list=f_dump_content_list,
f_make_md_mode=f_make_md_mode,
)
for index in sorted(need_remove_index, reverse=True):
del pdf_bytes_list[index]
del pdf_file_names[index]
del p_lang_list[index]
if not pdf_bytes_list:
logger.warning("No valid PDF or image files to process.")
return
# 预处理PDF字节数据
pdf_bytes_list = _prepare_pdf_bytes(pdf_bytes_list, start_page_id, end_page_id)
@@ -506,6 +588,24 @@ async def aio_do_parse(
end_page_id=None,
**kwargs,
):
need_remove_index = _process_office_doc(
output_dir,
pdf_file_names=pdf_file_names,
pdf_bytes_list=pdf_bytes_list,
f_dump_md=f_dump_md,
f_dump_middle_json=f_dump_middle_json,
f_dump_orig_file=f_dump_orig_pdf,
f_dump_content_list=f_dump_content_list,
f_make_md_mode=f_make_md_mode,
)
for index in sorted(need_remove_index, reverse=True):
del pdf_bytes_list[index]
del pdf_file_names[index]
del p_lang_list[index]
if not pdf_bytes_list:
logger.warning("No valid PDF or image files to process.")
return
# 预处理PDF字节数据
pdf_bytes_list = _prepare_pdf_bytes(pdf_bytes_list, start_page_id, end_page_id)

View File

@@ -22,7 +22,7 @@ logger.add(sys.stderr, level=log_level) # 添加新handler
from base64 import b64encode
from mineru.cli.common import aio_do_parse, read_fn, pdf_suffixes, image_suffixes
from mineru.cli.common import aio_do_parse, read_fn, pdf_suffixes, image_suffixes, office_suffixes
from mineru.utils.cli_parser import arg_parse
from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path
from mineru.version import __version__
@@ -187,7 +187,7 @@ async def parse_pdf(
# 如果是图像文件或PDF使用read_fn处理
file_suffix = guess_suffix_by_path(temp_path)
if file_suffix in pdf_suffixes + image_suffixes:
if file_suffix in pdf_suffixes + image_suffixes + office_suffixes:
try:
pdf_bytes = read_fn(temp_path)
pdf_bytes_list.append(pdf_bytes)

View File

File diff suppressed because it is too large Load Diff

Binary file not shown.

Binary file not shown.

18
mineru/model/docx/main.py Normal file
View File

@@ -0,0 +1,18 @@
from typing import BinaryIO
from mineru.model.docx.docx_converter import DocxConverter
def convert_path(file_path: str):
    """Convert the DOCX file located at *file_path*.

    The file is opened in binary read mode and the open handle is handed
    to ``convert_binary``; whatever that function returns is returned
    unchanged.

    Args:
        file_path: Path of the file to open.

    Returns:
        The result of ``convert_binary`` for the opened file.
    """
    with open(file_path, mode="rb") as docx_stream:
        # Conversion runs while the handle is still open; the handle is
        # closed when the ``with`` block exits.
        pages = convert_binary(docx_stream)
    return pages
def convert_binary(file_binary: BinaryIO):
    """Convert a DOCX document supplied as a binary stream.

    A fresh ``DocxConverter`` is created for every call, its ``convert``
    method is invoked with the stream, and the converter's ``pages``
    attribute is returned afterwards.

    Args:
        file_binary: Binary file-like object passed to
            ``DocxConverter.convert``.

    Returns:
        The ``pages`` attribute of the converter after conversion.
    """
    docx_converter = DocxConverter()
    docx_converter.convert(file_binary)
    parsed_pages = docx_converter.pages
    return parsed_pages
if __name__ == "__main__":
    # Manual entry point: convert "textbox.docx" (resolved relative to the
    # current working directory) and print the result.
    sample_pages = convert_path("textbox.docx")
    print(sample_pages)

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.3 MiB

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"><Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/><Default Extension="xml" ContentType="application/xml"/><Override PartName="/docProps/app.xml" ContentType="application/vnd.openxmlformats-officedocument.extended-properties+xml"/><Override PartName="/docProps/core.xml" ContentType="application/vnd.openxmlformats-package.core-properties+xml"/><Override PartName="/docProps/custom.xml" ContentType="application/vnd.openxmlformats-officedocument.custom-properties+xml"/><Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/><Override PartName="/word/fontTable.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml"/><Override PartName="/word/footer1.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml"/><Override PartName="/word/header1.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml"/><Override PartName="/word/header2.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml"/><Override PartName="/word/header3.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml"/><Override PartName="/word/header4.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml"/><Override PartName="/word/settings.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml"/><Override PartName="/word/styles.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml"/><Override PartName="/word/theme/theme1.xml" ContentType="application/vnd.openxmlformats-officedocument.theme+xml"/></Types>

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId4" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/><Relationship Id="rId2" Type="http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties" Target="docProps/core.xml"/><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties" Target="docProps/app.xml"/><Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/custom-properties" Target="docProps/custom.xml"/></Relationships>

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/extended-properties" xmlns:vt="http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes"><Template>Normal.dotm</Template><Pages>8</Pages><Words>0</Words><Characters>0</Characters><Lines>0</Lines><Paragraphs>0</Paragraphs><TotalTime>1</TotalTime><ScaleCrop>false</ScaleCrop><LinksUpToDate>false</LinksUpToDate><CharactersWithSpaces>0</CharactersWithSpaces><Application>WPS Office_12.1.0.17900_F1E327BC-269C-435d-A152-05C5408002CA</Application><DocSecurity>0</DocSecurity></Properties>

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:dcmitype="http://purl.org/dc/dcmitype/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><dcterms:created xsi:type="dcterms:W3CDTF">2026-01-11T00:35:00Z</dcterms:created><dc:creator>sidney</dc:creator><cp:lastModifiedBy>sidney</cp:lastModifiedBy><dcterms:modified xsi:type="dcterms:W3CDTF">2026-01-09T16:53:33Z</dcterms:modified><cp:revision>1</cp:revision></cp:coreProperties>

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/custom-properties" xmlns:vt="http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes"><property fmtid="{D5CDD505-2E9C-101B-9397-08002B2CF9AE}" pid="2" name="KSOProductBuildVer"><vt:lpwstr>2052-12.1.0.17900</vt:lpwstr></property><property fmtid="{D5CDD505-2E9C-101B-9397-08002B2CF9AE}" pid="3" name="ICV"><vt:lpwstr>DFF0AFE5816D6E22E9BD60691B8F4357_41</vt:lpwstr></property></Properties>

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId9" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable" Target="fontTable.xml"/><Relationship Id="rId8" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme" Target="theme/theme1.xml"/><Relationship Id="rId7" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/header" Target="header4.xml"/><Relationship Id="rId6" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/header" Target="header3.xml"/><Relationship Id="rId5" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/header" Target="header2.xml"/><Relationship Id="rId4" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/footer" Target="footer1.xml"/><Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/header" Target="header1.xml"/><Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings" Target="settings.xml"/><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles" Target="styles.xml"/></Relationships>

View File

@@ -0,0 +1,261 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas"
xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"
xmlns:o="urn:schemas-microsoft-com:office:office"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml"
xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing"
xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"
xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml"
xmlns:w10="urn:schemas-microsoft-com:office:word"
xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml"
xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup"
xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk"
xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml"
xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape"
xmlns:wpsCustomData="http://www.wps.cn/officeDocument/2013/wpsCustomData" mc:Ignorable="w14 w15 wp14">
<w:body>
<w:p w14:paraId="187C1D8D">
<w:pPr>
<w:rPr>
<w:rFonts w:hint="eastAsia"/>
<w:lang w:eastAsia="zh-CN"/>
</w:rPr>
</w:pPr>
<w:r>
<w:rPr>
<w:rFonts w:hint="eastAsia"/>
<w:lang w:eastAsia="zh-CN"/>
</w:rPr>
<w:t>第一节内容</w:t>
</w:r>
<w:r>
<w:rPr>
<w:rFonts w:hint="eastAsia"/>
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
</w:rPr>
<w:t>1</w:t>
</w:r>
<w:r>
<w:rPr>
<w:rFonts w:hint="eastAsia"/>
<w:lang w:eastAsia="zh-CN"/>
</w:rPr>
<w:br w:type="page"/>
</w:r>
</w:p>
<w:p w14:paraId="7BED7957">
<w:pPr>
<w:rPr>
<w:rFonts w:hint="eastAsia"/>
<w:lang w:eastAsia="zh-CN"/>
</w:rPr>
</w:pPr>
<w:r>
<w:rPr>
<w:rFonts w:hint="eastAsia"/>
<w:lang w:eastAsia="zh-CN"/>
</w:rPr>
<w:t>第一节内容</w:t>
</w:r>
<w:r>
<w:rPr>
<w:rFonts w:hint="eastAsia"/>
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
</w:rPr>
<w:t>2</w:t>
</w:r>
<w:r>
<w:rPr>
<w:rFonts w:hint="eastAsia"/>
<w:lang w:eastAsia="zh-CN"/>
</w:rPr>
<w:br w:type="page"/>
</w:r>
</w:p>
<w:p w14:paraId="786FF74E">
<w:pPr>
<w:rPr>
<w:rFonts w:hint="eastAsia"/>
<w:lang w:eastAsia="zh-CN"/>
</w:rPr>
<w:sectPr>
<w:headerReference r:id="rId3" w:type="default"/>
<w:footerReference r:id="rId4" w:type="default"/>
<w:pgSz w:w="11906" w:h="16838"/>
<w:pgMar w:top="1440" w:right="1800" w:bottom="1440" w:left="1800" w:header="851" w:footer="992"
w:gutter="0"/>
<w:cols w:space="425" w:num="1"/>
<w:docGrid w:type="lines" w:linePitch="312" w:charSpace="0"/>
</w:sectPr>
</w:pPr>
</w:p>
<w:p w14:paraId="544E2025">
<w:pPr>
<w:rPr>
<w:rFonts w:hint="eastAsia"/>
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
</w:rPr>
</w:pPr>
<w:r>
<w:rPr>
<w:rFonts w:hint="eastAsia"/>
<w:lang w:eastAsia="zh-CN"/>
</w:rPr>
<w:t>第二节内容</w:t>
</w:r>
<w:r>
<w:rPr>
<w:rFonts w:hint="eastAsia"/>
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
</w:rPr>
<w:t>1</w:t>
</w:r>
</w:p>
<w:p w14:paraId="620A8F4A">
<w:pPr>
<w:rPr>
<w:rFonts w:hint="eastAsia"/>
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
</w:rPr>
</w:pPr>
<w:r>
<w:rPr>
<w:rFonts w:hint="eastAsia"/>
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
</w:rPr>
<w:br w:type="page"/>
</w:r>
</w:p>
<w:p w14:paraId="1545E12B">
<w:pPr>
<w:rPr>
<w:rFonts w:hint="eastAsia"/>
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
</w:rPr>
<w:sectPr>
<w:headerReference r:id="rId5" w:type="default"/>
<w:pgSz w:w="11906" w:h="16838"/>
<w:pgMar w:top="1440" w:right="1800" w:bottom="1440" w:left="1800" w:header="851" w:footer="992"
w:gutter="0"/>
<w:cols w:space="425" w:num="1"/>
<w:docGrid w:type="lines" w:linePitch="312" w:charSpace="0"/>
</w:sectPr>
</w:pPr>
<w:r>
<w:rPr>
<w:rFonts w:hint="eastAsia"/>
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
</w:rPr>
<w:t>第二节内容2</w:t>
</w:r>
</w:p>
<w:p w14:paraId="206DE633">
<w:pPr>
<w:rPr>
<w:rFonts w:hint="eastAsia"/>
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
</w:rPr>
</w:pPr>
<w:r>
<w:rPr>
<w:rFonts w:hint="eastAsia"/>
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
</w:rPr>
<w:t>第三节内容1</w:t>
</w:r>
</w:p>
<w:p w14:paraId="35E87C09">
<w:pPr>
<w:rPr>
<w:rFonts w:hint="eastAsia"/>
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
</w:rPr>
</w:pPr>
<w:r>
<w:rPr>
<w:rFonts w:hint="eastAsia"/>
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
</w:rPr>
<w:br w:type="page"/>
</w:r>
</w:p>
<w:p w14:paraId="1977A116">
<w:pPr>
<w:rPr>
<w:rFonts w:hint="eastAsia"/>
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
</w:rPr>
<w:sectPr>
<w:headerReference r:id="rId6" w:type="default"/>
<w:pgSz w:w="11906" w:h="16838"/>
<w:pgMar w:top="1440" w:right="1800" w:bottom="1440" w:left="1800" w:header="851" w:footer="992"
w:gutter="0"/>
<w:cols w:space="425" w:num="1"/>
<w:docGrid w:type="lines" w:linePitch="312" w:charSpace="0"/>
</w:sectPr>
</w:pPr>
<w:r>
<w:rPr>
<w:rFonts w:hint="eastAsia"/>
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
</w:rPr>
<w:t>第三节内容2</w:t>
</w:r>
</w:p>
<w:p w14:paraId="43AB5318">
<w:pPr>
<w:rPr>
<w:rFonts w:hint="eastAsia"/>
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
</w:rPr>
</w:pPr>
<w:r>
<w:rPr>
<w:rFonts w:hint="eastAsia"/>
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
</w:rPr>
<w:t>第四节内容1</w:t>
</w:r>
</w:p>
<w:p w14:paraId="6FDEB506">
<w:pPr>
<w:rPr>
<w:rFonts w:hint="eastAsia"/>
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
</w:rPr>
</w:pPr>
<w:r>
<w:rPr>
<w:rFonts w:hint="eastAsia"/>
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
</w:rPr>
<w:br w:type="page"/>
</w:r>
</w:p>
<w:p w14:paraId="04B961C8">
<w:pPr>
<w:rPr>
<w:rFonts w:hint="default"/>
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
</w:rPr>
</w:pPr>
<w:r>
<w:rPr>
<w:rFonts w:hint="eastAsia"/>
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
</w:rPr>
<w:t>第四节内容2</w:t>
</w:r>
</w:p>
<w:sectPr>
<w:headerReference r:id="rId7" w:type="default"/>
<w:pgSz w:w="11906" w:h="16838"/>
<w:pgMar w:top="1440" w:right="1800" w:bottom="1440" w:left="1800" w:header="851" w:footer="992"
w:gutter="0"/>
<w:cols w:space="425" w:num="1"/>
<w:docGrid w:type="lines" w:linePitch="312" w:charSpace="0"/>
</w:sectPr>
</w:body>
</w:document>

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:fonts xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" mc:Ignorable="w14"><w:font w:name="Times New Roman"><w:panose1 w:val="02020603050405020304"/><w:charset w:val="00"/><w:family w:val="roman"/><w:pitch w:val="variable"/><w:sig w:usb0="20007A87" w:usb1="80000000" w:usb2="00000008" w:usb3="00000000" w:csb0="000001FF" w:csb1="00000000"/></w:font><w:font w:name="宋体"><w:altName w:val="Droid Sans Fallback"/><w:panose1 w:val="00000000000000000000"/><w:charset w:val="00"/><w:family w:val="auto"/><w:pitch w:val="default"/><w:sig w:usb0="00000000" w:usb1="00000000" w:usb2="00000000" w:usb3="00000000" w:csb0="00000000" w:csb1="00000000"/></w:font><w:font w:name="Wingdings"><w:altName w:val="Noto Color Emoji"/><w:panose1 w:val="05000000000000000000"/><w:charset w:val="02"/><w:family w:val="auto"/><w:pitch w:val="default"/><w:sig w:usb0="00000000" w:usb1="00000000" w:usb2="00000000" w:usb3="00000000" w:csb0="80000000" w:csb1="00000000"/></w:font><w:font w:name="Arial"><w:altName w:val="DejaVu Sans"/><w:panose1 w:val="020B0604020202020204"/><w:charset w:val="01"/><w:family w:val="swiss"/><w:pitch w:val="default"/><w:sig w:usb0="E0002AFF" w:usb1="C0007843" w:usb2="00000009" w:usb3="00000000" w:csb0="400001FF" w:csb1="FFFF0000"/></w:font><w:font w:name="黑体"><w:altName w:val="Droid Sans Fallback"/><w:panose1 w:val="02010609060101010101"/><w:charset w:val="86"/><w:family w:val="auto"/><w:pitch w:val="default"/><w:sig w:usb0="800002BF" w:usb1="38CF7CFA" w:usb2="00000016" w:usb3="00000000" w:csb0="00040001" w:csb1="00000000"/></w:font><w:font w:name="Courier New"><w:altName w:val="DejaVu Sans"/><w:panose1 w:val="02070309020205020404"/><w:charset w:val="01"/><w:family w:val="modern"/><w:pitch w:val="default"/><w:sig w:usb0="E0002AFF" 
w:usb1="C0007843" w:usb2="00000009" w:usb3="00000000" w:csb0="400001FF" w:csb1="FFFF0000"/></w:font><w:font w:name="Symbol"><w:altName w:val="Noto Color Emoji"/><w:panose1 w:val="05050102010706020507"/><w:charset w:val="02"/><w:family w:val="roman"/><w:pitch w:val="default"/><w:sig w:usb0="00000000" w:usb1="00000000" w:usb2="00000000" w:usb3="00000000" w:csb0="80000000" w:csb1="00000000"/></w:font><w:font w:name="DejaVu Sans"><w:panose1 w:val="020B0603030804020204"/><w:charset w:val="00"/><w:family w:val="auto"/><w:pitch w:val="default"/><w:sig w:usb0="E7006EFF" w:usb1="D200FDFF" w:usb2="0A246029" w:usb3="0400200C" w:csb0="600001FF" w:csb1="DFFF0000"/></w:font><w:font w:name="Calibri"><w:altName w:val="DejaVu Sans"/><w:panose1 w:val="020F0502020204030204"/><w:charset w:val="00"/><w:family w:val="swiss"/><w:pitch w:val="default"/><w:sig w:usb0="00000000" w:usb1="00000000" w:usb2="00000001" w:usb3="00000000" w:csb0="0000019F" w:csb1="00000000"/></w:font><w:font w:name="Droid Sans Fallback"><w:panose1 w:val="020B0502000000000001"/><w:charset w:val="86"/><w:family w:val="auto"/><w:pitch w:val="default"/><w:sig w:usb0="910002FF" w:usb1="2BDFFCFB" w:usb2="00000036" w:usb3="00000000" w:csb0="203F01FF" w:csb1="D7FF0000"/></w:font><w:font w:name="Noto Color Emoji"><w:panose1 w:val="02000609000000000000"/><w:charset w:val="00"/><w:family w:val="auto"/><w:pitch w:val="default"/><w:sig w:usb0="00000001" w:usb1="00000000" w:usb2="00000000" w:usb3="00000000" w:csb0="00000001" w:csb1="00000000"/></w:font></w:fonts>

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:ftr xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" xmlns:wpsCustomData="http://www.wps.cn/officeDocument/2013/wpsCustomData" mc:Ignorable="w14 w15 wp14"><w:p w14:paraId="7D3B533F"><w:pPr><w:pStyle w:val="2"/></w:pPr></w:p></w:ftr>

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:hdr xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" xmlns:wpsCustomData="http://www.wps.cn/officeDocument/2013/wpsCustomData" mc:Ignorable="w14 w15 wp14"><w:p w14:paraId="4B194AF1"><w:pPr><w:pStyle w:val="3"/><w:rPr><w:rFonts w:hint="default"/><w:lang w:val="en-US" w:eastAsia="zh-CN"/></w:rPr></w:pPr><w:r><w:rPr><w:rFonts w:hint="eastAsia"/><w:lang w:eastAsia="zh-CN"/></w:rPr><w:t>第一节页眉</w:t></w:r></w:p></w:hdr>

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:hdr xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" xmlns:wpsCustomData="http://www.wps.cn/officeDocument/2013/wpsCustomData" mc:Ignorable="w14 w15 wp14"><w:p w14:paraId="3762EF24"><w:pPr><w:pStyle w:val="3"/><w:rPr><w:rFonts w:hint="default"/><w:lang w:val="en-US" w:eastAsia="zh-CN"/></w:rPr></w:pPr><w:r><w:rPr><w:rFonts w:hint="eastAsia"/><w:lang w:eastAsia="zh-CN"/></w:rPr><w:t>第二节页眉</w:t></w:r></w:p></w:hdr>

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:hdr xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" xmlns:wpsCustomData="http://www.wps.cn/officeDocument/2013/wpsCustomData" mc:Ignorable="w14 w15 wp14"><w:p w14:paraId="4D235CA5"><w:pPr><w:pStyle w:val="3"/><w:rPr><w:rFonts w:hint="default"/><w:lang w:val="en-US" w:eastAsia="zh-CN"/></w:rPr></w:pPr><w:bookmarkStart w:id="0" w:name="_GoBack"/><w:r><w:rPr><w:rFonts w:hint="eastAsia"/><w:lang w:eastAsia="zh-CN"/></w:rPr><w:t>第三节页眉</w:t></w:r></w:p><w:bookmarkEnd w:id="0"/></w:hdr>

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:hdr xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" xmlns:wpsCustomData="http://www.wps.cn/officeDocument/2013/wpsCustomData" mc:Ignorable="w14 w15 wp14"><w:p w14:paraId="17918B03"><w:pPr><w:pStyle w:val="3"/><w:rPr><w:rFonts w:hint="default"/><w:lang w:val="en-US" w:eastAsia="zh-CN"/></w:rPr></w:pPr><w:r><w:rPr><w:rFonts w:hint="eastAsia"/><w:lang w:eastAsia="zh-CN"/></w:rPr><w:t>第四节页眉</w:t></w:r></w:p></w:hdr>

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:settings xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:sl="http://schemas.openxmlformats.org/schemaLibrary/2006/main" xmlns:wpsCustomData="http://www.wps.cn/officeDocument/2013/wpsCustomData" mc:Ignorable="w14"><w:zoom w:percent="60"/><w:embedSystemFonts/><w:bordersDoNotSurroundHeader w:val="1"/><w:bordersDoNotSurroundFooter w:val="1"/><w:documentProtection w:enforcement="0"/><w:defaultTabStop w:val="420"/><w:drawingGridVerticalSpacing w:val="156"/><w:displayHorizontalDrawingGridEvery w:val="0"/><w:displayVerticalDrawingGridEvery w:val="2"/><w:characterSpacingControl w:val="compressPunctuation"/><w:compat><w:spaceForUL/><w:balanceSingleByteDoubleByteWidth/><w:doNotLeaveBackslashAlone/><w:ulTrailSpace/><w:doNotExpandShiftReturn/><w:adjustLineHeightInTable/><w:useFELayout/><w:compatSetting w:name="compatibilityMode" w:uri="http://schemas.microsoft.com/office/word" w:val="14"/><w:compatSetting w:name="overrideTableStyleFontSizeAndJustification" w:uri="http://schemas.microsoft.com/office/word" w:val="1"/><w:compatSetting w:name="enableOpenTypeFeatures" w:uri="http://schemas.microsoft.com/office/word" w:val="1"/><w:compatSetting w:name="doNotFlipMirrorIndents" w:uri="http://schemas.microsoft.com/office/word" w:val="1"/></w:compat><w:rsids><w:rsidRoot w:val="B8F19361"/><w:rsid w:val="3FDD74DB"/><w:rsid w:val="5ECF68F6"/><w:rsid w:val="6BC3D698"/><w:rsid w:val="777D451B"/><w:rsid w:val="7F39D9A1"/><w:rsid w:val="B8F19361"/><w:rsid w:val="FDF770BE"/></w:rsids><m:mathPr><m:mathFont m:val="Cambria Math"/><m:brkBin 
m:val="before"/><m:brkBinSub m:val="--"/><m:smallFrac m:val="0"/><m:dispDef/><m:lMargin m:val="0"/><m:rMargin m:val="0"/><m:defJc m:val="centerGroup"/><m:wrapIndent m:val="1440"/><m:intLim m:val="subSup"/><m:naryLim m:val="undOvr"/></m:mathPr><w:themeFontLang w:val="en-US" w:eastAsia="zh-CN"/><w:clrSchemeMapping w:bg1="light1" w:t1="dark1" w:bg2="light2" w:t2="dark2" w:accent1="accent1" w:accent2="accent2" w:accent3="accent3" w:accent4="accent4" w:accent5="accent5" w:accent6="accent6" w:hyperlink="hyperlink" w:followedHyperlink="followedHyperlink"/><w:doNotIncludeSubdocsInStats/></w:settings>

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Binary file not shown.

BIN
mineru/model/docx/test.docx Normal file

Binary file not shown.

144
mineru/model/docx/test.html Normal file
View File

@@ -0,0 +1,144 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Title</title>
<p><a id="_Toc411426751"></a><strong>采购合同</strong></p>
<p>合同编号: </p>
<p>签订地点: </p>
<p>签订时间: </p>
<p>采购人(甲方): </p>
<p>供应商(乙方): </p>
<p><a id="_九、其他"></a><a id="_Toc217446115"></a>根据《中华人民共和国政府采购法》、《中华人民共和国合同法》及XX政府采购中心
采购项目项目编号XX的《招标文件》、乙方的《投标文件》及《中标通知书》甲、乙双方同意签订本合同。详细技术说明及其他有关合同项目的特定信息由合同附件予以说明合同附件及本项目的招标文件、投标文件、《中标通知书》等均为本合同不可分割的部分。双方同意共同遵守如下条款
</p>
<p><a id="_Toc308164852"></a><a id="_Toc217446107"></a>一、合同货物</p>
<table>
<tr>
<td rowspan="2"><p>货物</p>
<p>品名</p></td>
<td rowspan="2"><p>规格</p>
<p>型号</p></td>
<td rowspan="2"><p>单位</p></td>
<td rowspan="2"><p>数量</p></td>
<td rowspan="2"><p>单价</p>
<p>(万元)</p></td>
<td rowspan="2"><p>总价(万元)</p></td>
<td colspan="3"><p>资金来源(万元)</p></td>
<td rowspan="2"><p>随机</p>
<p>配件</p></td>
<td rowspan="2"><p>交货期</p></td>
</tr>
<tr>
<td><p>预算内</p></td>
<td><p>预算外</p></td>
<td><p>其他</p></td>
</tr>
<tr>
<td><p> </p></td>
<td><p> </p></td>
<td><p> </p></td>
<td><p> </p></td>
<td><p> </p></td>
<td><p> </p></td>
<td></td>
<td></td>
<td><p> </p></td>
<td></td>
<td><p> </p></td>
</tr>
<tr>
<td><p> </p></td>
<td><p> </p></td>
<td><p> </p></td>
<td><p> </p></td>
<td><p> </p></td>
<td><p> </p></td>
<td></td>
<td></td>
<td><p> </p></td>
<td></td>
<td><p> </p></td>
</tr>
</table>
<p><a id="_Toc308164853"></a><a id="_Toc217446108"></a>二、合同总价</p>
<p>合同总价为人民币大写: 元即RMB¥
元;该合同总价已包括货物设计、材料、制造、包装、运输、安装、调试、检测、验收合格交付使用之前及保修期内保修服务与备用物件等等所有其他有关各项的含税费用。本合同执行期间合同总价不变,甲方无须另向乙方支付本合同规定之外的其他任何费用。</p>
<p><a id="_Toc308164854"></a><a id="_Toc217446109"></a>三、质量要求</p>
<p>1、乙方须提供全新的货物含零部件、配件等表面无划伤、无碰撞痕迹且权属清楚不得侵害他人的知识产权。</p>
<p>2、货物必须符合或优于国家行业 标准,以及本项目招标文件的质量要求和技术指标与出厂标准。</p>
<p>3、乙方须在本合同签订之日起
日内送交货物成品样品给甲方确认,在甲方出具样品确认书并封存成品样品外观尺寸后,乙方才能按样生产,并以此样品作为验收样品;每台货物上均应有产品质量检验合格标志。</p>
<p>
4、货物制造质量出现问题乙方应负责三包包修、包换、包退费用由乙方负担甲方有权到乙方生产场地检查货物质量和生产进度。</p>
<p>5、货到现场后由于甲方保管不当造成的质量问题乙方亦应负责修理但费用由甲方负担。</p>
<p><a id="_Toc308164855"></a><a id="_Toc217446110"></a>四、交货及验收</p>
<p>1、乙方交货期限为合同签订生效后的 日内,在合同签订生效之日起 天内交货到甲方指定地点,随即在
日内全部完成安装调试验收合格交付使用,并且最迟应在 年 月
日前全部完成安装调试验收合格交付使用(如由于采购人的原因造成合同延迟签订或验收的,时间顺延)。交货验收时须提供产品质检部门从同类产品中抽样检查合格的检测报告。</p>
<p>2、验收由甲方组织乙方配合进行</p>
<p>(1) 货物在乙方通知安装调试完毕后 日内初步验收。初步验收合格后,进入 试用期;试用期间发生重大质量问题,修复后试用相应顺延;试用期结束后
日内完成最终验收;</p>
<p>(2)
验收标准:按国家有关规定以及甲方招标文件的质量要求和技术指标、乙方的投标文件及承诺与本合同约定标准进行验收;甲乙双方如对质量要求和技术指标的约定标准有相互抵触或异议的事项,由甲方在招标与投标文件中按质量要求和技术指标比较优胜的原则确定该项的约定标准进行验收;</p>
<p>(3)
验收时如发现所交付的货物有短装、次品、损坏或其它不符合标准及本合同规定之情形者,甲方应做出详尽的现场记录,或由甲乙双方签署备忘录,此现场记录或备忘录可用作补充、缺失和更换损坏部件的有效证据,由此产生的时间延误与有关费用由乙方承担,验收期限相应顺延;</p>
<p>(4) 如质量验收合格,双方签署质量验收报告。</p>
<p>3、货物安装完成后 日内,甲方无故不进行验收工作并已使用货物的,视同已安装调试完成并验收合格。</p>
<p>4、乙方不能完整交付货物必须负责补齐否则视为未按合同约定交货。</p>
<p>5、如货物经乙方
次维修仍不能达到合同约定的质量标准,甲方有权退货,并视作乙方不能交付货物而须支付违约赔偿金给甲方,甲方还可依法追究乙方的违约责任。 </p>
<p><a id="_Toc308164856"></a><a id="_Toc217446111"></a>五、付款方式</p>
<p>1、甲方在本合同签订生效之日起计算款额¥ 元,人民币大写: 元整)后的 日内支付合同金额百分之
的价款(根据磋商文件要求);</p>
<p>2、全部货物安装调试完毕并验收合格之日起甲方接到乙方通知与票据凭证资料以后的 日内,向乙方核拨合同总价的百分之 款项:¥
元,人民币大写 元整;</p>
<p>3、合同履约保证金在货物验收合格满 后,甲方财务部门接到乙方通知和支付凭证资料文件,以及由甲方确认本合同货物质量与服务等约定事项已经履行完毕的正式书面文件后的
日内,递交结算凭证资料给银行并由其向乙方支付价款¥ 元, 人民币大写: 元整(根据招标文件要求);</p>
<p>4、乙方须向甲方出具合法有效完整的完税发票及凭证资料进行支付结算。</p>
<p><a id="_Toc217446112"></a><a id="_Toc308164857"></a>六、售后服务</p>
<p>1、质保期为验收合格后 年,质保期内出现质量问题,乙方在接到通知后 小时内响应到场, 小时内完成维修或更换,并承担修理调换的费用;如货物经乙方
次维修仍不能达到本合同约定的质量标准,视作乙方未能按时交货,甲方有权退货并追究乙方的违约责任。货到现场后由于甲方保管不当造成的问题,乙方亦应负责修复,但费用由甲方负担。</p>
<p>2、乙方须指派专人负责与甲方联系售后服务事宜。 </p>
<p><a id="_Toc217446113"></a><a id="_Toc308164858"></a>七、违约责任</p>
<p>1、甲方违约责任</p>
<p>1 甲方无正当理由拒收货物的,甲方应偿付合同总价百分之 的违约金;</p>
<p>2 甲方逾期支付货款的,除应及时付足货款外,应向乙方偿付欠款总额万分之 /天的违约金;逾期付款超过
天的,乙方有权终止合同;</p>
<p>3 甲方偿付的违约金不足以弥补乙方损失的,还应按乙方损失尚未弥补的部分,支付赔偿金给乙方。</p>
<p>2、乙方违约责任</p>
<p>1乙方交付的货物质量不符合合同规定的乙方应向甲方支付合同总价的百分之
的违约金并须在合同规定的交货时间内更换合格的货物给甲方否则视作乙方不能交付货物而违约按本条本款下述第“2”项规定由乙方偿付违约赔偿金给甲方。</p>
<p>2乙方不能交付货物或逾期交付货物而违约的除应及时交足货物外应向甲方偿付逾期交货部分货款总额的万分之
/天的违约金;逾期交货超过 天,甲方有权终止合同,乙方则应按合同总价的百分之
的款额向甲方偿付赔偿金,并须全额退还甲方已经付给乙方的货款及其利息。</p>
<p>3乙方货物经甲方送交具有法定资格条件的质量技术监督机构检测后如检测结果认定货物质量不符合本合同规定标准的则视为乙方没有按时交货而违约乙方须在
天内无条件更换合格的货物,如逾期不能更换合格的货物,甲方有权终止本合同,乙方应另付合同总价的百分之
的赔偿金给甲方。</p>
<p>4乙方保证本合同货物的权利无瑕疵包括货物所有权及知识产权等权利无瑕疵。如任何第三方经法院或仲裁机构裁决有权对上述货物主张权利或国家机关依法对货物进行没收查处的乙方除应向甲方返还已收款项外还应另按合同总价的百分之
向甲方支付违约金并赔偿因此给甲方造成的一切损失。</p>
<p>5乙方偿付的违约金不足以弥补甲方损失的还应按甲方损失尚未弥补的部分支付赔偿金给甲方。</p>
<p><a id="_Toc308164859"></a><a id="_Toc217446114"></a>八、争议解决办法</p>
<p>
1、因货物的质量问题发生争议由质量技术监督部门或其指定的质量鉴定机构进行质量鉴定。货物符合标准的鉴定费由甲方承担货物不符合质量标准的鉴定费由乙方承担。</p>
<p>2、合同履行期间,若双方发生争议,可协商或由有关部门调解解决,协商或调解不成的,由当事人依法维护其合法权益。</p>
<p><a id="_Toc308164860"></a>九、其他</p>
<p>1、如有未尽事宜由双方依法订立补充合同。</p>
<p>2、本合同双方应加盖骑缝章。</p>
<p>3、本合同一式四份自双方签章并经省政府采购中心审核编号后生效。甲方、乙方、政府采购管理部门、 省政府采购中心各一份。</p>
<p>甲方: (盖章) 乙方: (盖章)</p>
<p>法定代表人(授权代表): 法定代表人(授权代表):</p>
<p>地 址: 地 址:</p>
<p>开户银行: 开户银行:</p>
<p>账号: 账号:</p>
<p>电 话: 电 话:</p>
<p>传 真: 传 真:</p>
<p>签约日期:年 月 日 签约日期: 年 月 </p>
<p><img alt="185310636-6ce02f7c-790d-479f-b163-ea97a5a04808-20240708082238739"
src="output/test/185310636-6ce02f7c-790d-479f-b163-ea97a5a04808-20240708082238739.jpg"/></p>
<p>发票 1</p>
</head>
<body>
</body>
</html>

127
mineru/model/docx/test.md Normal file
View File

@@ -0,0 +1,127 @@
**采购合同**
合同编号:
签订地点:
签订时间:
采购人(甲方):
供应商(乙方):
根据《中华人民共和国政府采购法》、《中华人民共和国合同法》及XX政府采购中心 采购项目项目编号XX的《招标文件》、乙方的《投标文件》及《中标通知书》甲、乙双方同意签订本合同。详细技术说明及其他有关合同项目的特定信息由合同附件予以说明合同附件及本项目的招标文件、投标文件、《中标通知书》等均为本合同不可分割的部分。双方同意共同遵守如下条款
一、合同货物
<table><tr><td rowspan="2"><p>货物</p><p>品名</p></td><td rowspan="2"><p>规格</p><p>型号</p></td><td rowspan="2"><p>单位</p></td><td rowspan="2"><p>数量</p></td><td rowspan="2"><p>单价</p><p>(万元)</p></td><td rowspan="2"><p>总价(万元)</p></td><td colspan="3"><p>资金来源(万元)</p></td><td rowspan="2"><p>随机</p><p>配件</p></td><td rowspan="2"><p>交货期</p></td></tr><tr><td><p>预算内</p></td><td><p>预算外</p></td><td><p>其他</p></td></tr><tr><td><p> </p></td><td><p> </p></td><td><p> </p></td><td><p> </p></td><td><p> </p></td><td><p> </p></td><td></td><td></td><td><p> </p></td><td></td><td><p> </p></td></tr><tr><td><p> </p></td><td><p> </p></td><td><p> </p></td><td><p> </p></td><td><p> </p></td><td><p> </p></td><td></td><td></td><td><p> </p></td><td></td><td><p> </p></td></tr></table>
二、合同总价
合同总价为人民币大写: 元即RMB¥ 元;该合同总价已包括货物设计、材料、制造、包装、运输、安装、调试、检测、验收合格交付使用之前及保修期内保修服务与备用物件等等所有其他有关各项的含税费用。本合同执行期间合同总价不变,甲方无须另向乙方支付本合同规定之外的其他任何费用。
三、质量要求
1、乙方须提供全新的货物含零部件、配件等表面无划伤、无碰撞痕迹且权属清楚不得侵害他人的知识产权。
2、货物必须符合或优于国家行业 标准,以及本项目招标文件的质量要求和技术指标与出厂标准。
3、乙方须在本合同签订之日起 日内送交货物成品样品给甲方确认,在甲方出具样品确认书并封存成品样品外观尺寸后,乙方才能按样生产,并以此样品作为验收样品;每台货物上均应有产品质量检验合格标志。
4、货物制造质量出现问题乙方应负责三包包修、包换、包退费用由乙方负担甲方有权到乙方生产场地检查货物质量和生产进度。
5、货到现场后由于甲方保管不当造成的质量问题乙方亦应负责修理但费用由甲方负担。
四、交货及验收
1、乙方交货期限为合同签订生效后的 日内,在合同签订生效之日起 天内交货到甲方指定地点,随即在 日内全部完成安装调试验收合格交付使用,并且最迟应在 年 月 日前全部完成安装调试验收合格交付使用(如由于采购人的原因造成合同延迟签订或验收的,时间顺延)。交货验收时须提供产品质检部门从同类产品中抽样检查合格的检测报告。
2、验收由甲方组织乙方配合进行
(1) 货物在乙方通知安装调试完毕后 日内初步验收。初步验收合格后,进入 试用期;试用期间发生重大质量问题,修复后试用相应顺延;试用期结束后 日内完成最终验收;
(2) 验收标准:按国家有关规定以及甲方招标文件的质量要求和技术指标、乙方的投标文件及承诺与本合同约定标准进行验收;甲乙双方如对质量要求和技术指标的约定标准有相互抵触或异议的事项,由甲方在招标与投标文件中按质量要求和技术指标比较优胜的原则确定该项的约定标准进行验收;
(3) 验收时如发现所交付的货物有短装、次品、损坏或其它不符合标准及本合同规定之情形者,甲方应做出详尽的现场记录,或由甲乙双方签署备忘录,此现场记录或备忘录可用作补充、缺失和更换损坏部件的有效证据,由此产生的时间延误与有关费用由乙方承担,验收期限相应顺延;
(4) 如质量验收合格,双方签署质量验收报告。
3、货物安装完成后 日内,甲方无故不进行验收工作并已使用货物的,视同已安装调试完成并验收合格。
4、乙方不能完整交付货物必须负责补齐否则视为未按合同约定交货。
5、如货物经乙方 次维修仍不能达到合同约定的质量标准,甲方有权退货,并视作乙方不能交付货物而须支付违约赔偿金给甲方,甲方还可依法追究乙方的违约责任。
五、付款方式
1、甲方在本合同签订生效之日起计算款额¥ 元,人民币大写: 元整)后的 日内支付合同金额百分之 的价款(根据磋商文件要求);
2、全部货物安装调试完毕并验收合格之日起甲方接到乙方通知与票据凭证资料以后的 日内,向乙方核拨合同总价的百分之 款项:¥ 元,人民币大写 元整;
3、合同履约保证金在货物验收合格满 后,甲方财务部门接到乙方通知和支付凭证资料文件,以及由甲方确认本合同货物质量与服务等约定事项已经履行完毕的正式书面文件后的 日内,递交结算凭证资料给银行并由其向乙方支付价款¥ 元, 人民币大写: 元整(根据招标文件要求);
4、乙方须向甲方出具合法有效完整的完税发票及凭证资料进行支付结算。
六、售后服务
1、质保期为验收合格后 年,质保期内出现质量问题,乙方在接到通知后 小时内响应到场, 小时内完成维修或更换,并承担修理调换的费用;如货物经乙方 次维修仍不能达到本合同约定的质量标准,视作乙方未能按时交货,甲方有权退货并追究乙方的违约责任。货到现场后由于甲方保管不当造成的问题,乙方亦应负责修复,但费用由甲方负担。
2、乙方须指派专人负责与甲方联系售后服务事宜。
七、违约责任
1、甲方违约责任
1 甲方无正当理由拒收货物的,甲方应偿付合同总价百分之 的违约金;
2 甲方逾期支付货款的,除应及时付足货款外,应向乙方偿付欠款总额万分之 /天的违约金;逾期付款超过 天的,乙方有权终止合同;
3 甲方偿付的违约金不足以弥补乙方损失的,还应按乙方损失尚未弥补的部分,支付赔偿金给乙方。
2、乙方违约责任
1乙方交付的货物质量不符合合同规定的乙方应向甲方支付合同总价的百分之 的违约金并须在合同规定的交货时间内更换合格的货物给甲方否则视作乙方不能交付货物而违约按本条本款下述第“2”项规定由乙方偿付违约赔偿金给甲方。
2乙方不能交付货物或逾期交付货物而违约的除应及时交足货物外应向甲方偿付逾期交货部分货款总额的万分之 /天的违约金;逾期交货超过 天,甲方有权终止合同,乙方则应按合同总价的百分之 的款额向甲方偿付赔偿金,并须全额退还甲方已经付给乙方的货款及其利息。
3乙方货物经甲方送交具有法定资格条件的质量技术监督机构检测后如检测结果认定货物质量不符合本合同规定标准的则视为乙方没有按时交货而违约乙方须在 天内无条件更换合格的货物,如逾期不能更换合格的货物,甲方有权终止本合同,乙方应另付合同总价的百分之 的赔偿金给甲方。
4乙方保证本合同货物的权利无瑕疵包括货物所有权及知识产权等权利无瑕疵。如任何第三方经法院或仲裁机构裁决有权对上述货物主张权利或国家机关依法对货物进行没收查处的乙方除应向甲方返还已收款项外还应另按合同总价的百分之 向甲方支付违约金并赔偿因此给甲方造成的一切损失。
5乙方偿付的违约金不足以弥补甲方损失的还应按甲方损失尚未弥补的部分支付赔偿金给甲方。
八、争议解决办法
1、因货物的质量问题发生争议由质量技术监督部门或其指定的质量鉴定机构进行质量鉴定。货物符合标准的鉴定费由甲方承担货物不符合质量标准的鉴定费由乙方承担。
2、合同履行期间,若双方发生争议,可协商或由有关部门调解解决,协商或调解不成的,由当事人依法维护其合法权益。
九、其他
1、如有未尽事宜由双方依法订立补充合同。
2、本合同双方应加盖骑缝章。
3、本合同一式四份自双方签章并经省政府采购中心审核编号后生效。甲方、乙方、政府采购管理部门、 省政府采购中心各一份。
甲方: (盖章) 乙方: (盖章)
法定代表人(授权代表): 法定代表人(授权代表):
地 址: 地 址:
开户银行: 开户银行:
账号: 账号:
电 话: 电 话:
传 真: 传 真:
签约日期:年 月 日 签约日期: 年 月
![185310636-6ce02f7c-790d-479f-b163-ea97a5a04808-20240708082238739](test.png)
发票 1
进程已结束,退出代码为 0

BIN
mineru/model/docx/test.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.3 MiB

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"><Default Extension="jpeg" ContentType="image/jpeg"/><Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/><Default Extension="xml" ContentType="application/xml"/><Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/><Override PartName="/customXml/itemProps1.xml" ContentType="application/vnd.openxmlformats-officedocument.customXmlProperties+xml"/><Override PartName="/word/numbering.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml"/><Override PartName="/word/styles.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml"/><Override PartName="/word/settings.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml"/><Override PartName="/word/webSettings.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.webSettings+xml"/><Override PartName="/word/footnotes.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml"/><Override PartName="/word/endnotes.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml"/><Override PartName="/word/header1.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml"/><Override PartName="/word/header2.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml"/><Override PartName="/word/footer1.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml"/><Override PartName="/word/footer2.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml"/><Override PartName="/word/header3.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml"/><Override 
PartName="/word/footer3.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml"/><Override PartName="/word/fontTable.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml"/><Override PartName="/word/glossary/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.glossary+xml"/><Override PartName="/word/glossary/styles.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml"/><Override PartName="/word/glossary/settings.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml"/><Override PartName="/word/glossary/webSettings.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.webSettings+xml"/><Override PartName="/word/glossary/fontTable.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml"/><Override PartName="/word/theme/theme1.xml" ContentType="application/vnd.openxmlformats-officedocument.theme+xml"/><Override PartName="/docProps/core.xml" ContentType="application/vnd.openxmlformats-package.core-properties+xml"/><Override PartName="/docProps/app.xml" ContentType="application/vnd.openxmlformats-officedocument.extended-properties+xml"/><Override PartName="/docProps/custom.xml" ContentType="application/vnd.openxmlformats-officedocument.custom-properties+xml"/></Types>

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties" Target="docProps/app.xml"/><Relationship Id="rId2" Type="http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties" Target="docProps/core.xml"/><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/><Relationship Id="rId4" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/custom-properties" Target="docProps/custom.xml"/></Relationships>

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/customXmlProps" Target="itemProps1.xml"/></Relationships>

View File

@@ -0,0 +1 @@
<?xml version="1.0" standalone="no"?><b:Sources xmlns:b="http://schemas.openxmlformats.org/officeDocument/2006/bibliography" xmlns="http://schemas.openxmlformats.org/officeDocument/2006/bibliography" SelectedStyle="\APASixthEditionOfficeOnline.xsl" StyleName="APA" Version="6"></b:Sources>

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<ds:datastoreItem ds:itemID="{247BE1E9-2D8F-4D80-862D-FA139EE27171}" xmlns:ds="http://schemas.openxmlformats.org/officeDocument/2006/customXml"><ds:schemaRefs><ds:schemaRef ds:uri="http://schemas.openxmlformats.org/officeDocument/2006/bibliography"/></ds:schemaRefs></ds:datastoreItem>

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/extended-properties" xmlns:vt="http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes"><Template>{2447CD0F-22A7-4A1B-A9BC-34FF8F943A34}TFf1603197-d3d8-44fc-95f7-0445aa29d9afca513b70_win32-e2ceec99d124.dotx</Template><TotalTime>6</TotalTime><Pages>2</Pages><Words>82</Words><Characters>468</Characters><Application>Microsoft Office Word</Application><DocSecurity>0</DocSecurity><Lines>3</Lines><Paragraphs>1</Paragraphs><ScaleCrop>false</ScaleCrop><HeadingPairs><vt:vector size="2" baseType="variant"><vt:variant><vt:lpstr>Title</vt:lpstr></vt:variant><vt:variant><vt:i4>1</vt:i4></vt:variant></vt:vector></HeadingPairs><TitlesOfParts><vt:vector size="1" baseType="lpstr"><vt:lpstr></vt:lpstr></vt:vector></TitlesOfParts><Company></Company><LinksUpToDate>false</LinksUpToDate><CharactersWithSpaces>549</CharactersWithSpaces><SharedDoc>false</SharedDoc><HyperlinksChanged>false</HyperlinksChanged><AppVersion>16.0000</AppVersion></Properties>

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:dcmitype="http://purl.org/dc/dcmitype/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><dc:title></dc:title><dc:subject></dc:subject><dc:creator>Sidney Chen</dc:creator><cp:keywords></cp:keywords><dc:description></dc:description><cp:lastModifiedBy>Sidney Chen</cp:lastModifiedBy><cp:revision>1</cp:revision><dcterms:created xsi:type="dcterms:W3CDTF">2025-11-04T08:12:00Z</dcterms:created><dcterms:modified xsi:type="dcterms:W3CDTF">2025-11-04T08:18:00Z</dcterms:modified></cp:coreProperties>

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/custom-properties" xmlns:vt="http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes"><property fmtid="{D5CDD505-2E9C-101B-9397-08002B2CF9AE}" pid="2" name="ContentTypeId"><vt:lpwstr>0x010100AA3F7D94069FF64A86F7DFF56D60E3BE</vt:lpwstr></property></Properties>

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId8" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="media/image1.jpeg"/><Relationship Id="rId13" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/header" Target="header3.xml"/><Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles" Target="styles.xml"/><Relationship Id="rId7" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes" Target="endnotes.xml"/><Relationship Id="rId12" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/footer" Target="footer2.xml"/><Relationship Id="rId17" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme" Target="theme/theme1.xml"/><Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering" Target="numbering.xml"/><Relationship Id="rId16" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/glossaryDocument" Target="glossary/document.xml"/><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/customXml" Target="../customXml/item1.xml"/><Relationship Id="rId6" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/footnotes" Target="footnotes.xml"/><Relationship Id="rId11" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/footer" Target="footer1.xml"/><Relationship Id="rId5" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/webSettings" Target="webSettings.xml"/><Relationship Id="rId15" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable" Target="fontTable.xml"/><Relationship Id="rId10" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/header" Target="header2.xml"/><Relationship Id="rId4" 
Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings" Target="settings.xml"/><Relationship Id="rId9" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/header" Target="header1.xml"/><Relationship Id="rId14" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/footer" Target="footer3.xml"/></Relationships>

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/attachedTemplate" Target="file:///C:\Users\sidney\AppData\Local\Microsoft\Office\16.0\DTS\zh-CN%7bFCF86E55-83C6-41EC-B02E-A4F386444BB8%7d\%7b2447CD0F-22A7-4A1B-A9BC-34FF8F943A34%7dTFf1603197-d3d8-44fc-95f7-0445aa29d9afca513b70_win32-e2ceec99d124.dotx" TargetMode="External"/></Relationships>

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:endnotes xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" 
xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du wp14"><w:endnote w:type="separator" w:id="-1"><w:p w14:paraId="066749CB" w14:textId="77777777" w:rsidR="00DE3E16" w:rsidRDefault="00DE3E16"><w:pPr><w:spacing w:line="240" w:lineRule="auto"/><w:rPr><w:rFonts w:hint="eastAsia"/></w:rPr></w:pPr><w:r><w:separator/></w:r></w:p></w:endnote><w:endnote w:type="continuationSeparator" w:id="0"><w:p w14:paraId="1D7CA5B8" w14:textId="77777777" w:rsidR="00DE3E16" w:rsidRDefault="00DE3E16"><w:pPr><w:spacing w:line="240" w:lineRule="auto"/><w:rPr><w:rFonts w:hint="eastAsia"/></w:rPr></w:pPr><w:r><w:continuationSeparator/></w:r></w:p></w:endnote></w:endnotes>

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:fonts xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du"><w:font w:name="Symbol"><w:panose1 w:val="05050102010706020507"/><w:charset w:val="02"/><w:family w:val="roman"/><w:pitch w:val="variable"/><w:sig w:usb0="00000000" w:usb1="10000000" w:usb2="00000000" w:usb3="00000000" w:csb0="80000000" w:csb1="00000000"/></w:font><w:font w:name="Times New Roman"><w:panose1 w:val="02020603050405020304"/><w:charset w:val="00"/><w:family w:val="roman"/><w:pitch w:val="variable"/><w:sig w:usb0="E0002EFF" w:usb1="C000785B" w:usb2="00000009" w:usb3="00000000" w:csb0="000001FF" w:csb1="00000000"/></w:font><w:font w:name="Arial"><w:panose1 w:val="020B0604020202020204"/><w:charset w:val="00"/><w:family w:val="swiss"/><w:pitch w:val="variable"/><w:sig w:usb0="E0002EFF" w:usb1="C000785B" w:usb2="00000009" w:usb3="00000000" w:csb0="000001FF" w:csb1="00000000"/></w:font><w:font w:name="MS PGothic"><w:panose1 w:val="020B0600070205080204"/><w:charset w:val="80"/><w:family w:val="swiss"/><w:pitch w:val="variable"/><w:sig w:usb0="E00002FF" w:usb1="6AC7FDFB" w:usb2="08000012" w:usb3="00000000" w:csb0="0002009F" 
w:csb1="00000000"/></w:font><w:font w:name="Microsoft YaHei UI"><w:panose1 w:val="020B0503020204020204"/><w:charset w:val="86"/><w:family w:val="swiss"/><w:pitch w:val="variable"/><w:sig w:usb0="80000287" w:usb1="2ACF3C50" w:usb2="00000016" w:usb3="00000000" w:csb0="0004001F" w:csb1="00000000"/></w:font><w:font w:name="黑体"><w:altName w:val="SimHei"/><w:panose1 w:val="02010609060101010101"/><w:charset w:val="86"/><w:family w:val="modern"/><w:pitch w:val="fixed"/><w:sig w:usb0="800002BF" w:usb1="38CF7CFA" w:usb2="00000016" w:usb3="00000000" w:csb0="00040001" w:csb1="00000000"/></w:font></w:fonts>

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:ftr xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" 
xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du wp14"><w:p w14:paraId="71759E2F" w14:textId="77777777" w:rsidR="00BB4862" w:rsidRDefault="00BB4862"><w:pPr><w:pStyle w:val="af0"/><w:rPr><w:rFonts w:hint="eastAsia"/></w:rPr></w:pPr></w:p></w:ftr>

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:ftr xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" 
xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du wp14"><w:sdt><w:sdtPr><w:id w:val="107940834"/><w:docPartObj><w:docPartGallery w:val="Page Numbers (Bottom of Page)"/><w:docPartUnique/></w:docPartObj></w:sdtPr><w:sdtEndPr><w:rPr><w:noProof/></w:rPr></w:sdtEndPr><w:sdtContent><w:p w14:paraId="34CD4779" w14:textId="77777777" w:rsidR="008049DB" w:rsidRDefault="006A3739"><w:pPr><w:pStyle w:val="af0"/><w:rPr><w:rFonts w:hint="eastAsia"/></w:rPr></w:pPr><w:r><w:rPr><w:lang w:val="zh-CN" w:bidi="zh-CN"/></w:rPr><w:fldChar w:fldCharType="begin"/></w:r><w:r><w:rPr><w:lang w:val="zh-CN" w:bidi="zh-CN"/></w:rPr><w:instrText xml:space="preserve"> PAGE \* MERGEFORMAT </w:instrText></w:r><w:r><w:rPr><w:lang w:val="zh-CN" w:bidi="zh-CN"/></w:rPr><w:fldChar w:fldCharType="separate"/></w:r><w:r w:rsidR="00793AFB"><w:rPr><w:noProof/><w:lang w:val="zh-CN" w:bidi="zh-CN"/></w:rPr><w:t>2</w:t></w:r><w:r><w:rPr><w:noProof/><w:lang w:val="zh-CN" w:bidi="zh-CN"/></w:rPr><w:fldChar w:fldCharType="end"/></w:r></w:p></w:sdtContent></w:sdt></w:ftr>

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:ftr xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" 
xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du wp14"><w:p w14:paraId="06BA2A24" w14:textId="77777777" w:rsidR="00BB4862" w:rsidRDefault="00BB4862"><w:pPr><w:pStyle w:val="af0"/><w:rPr><w:rFonts w:hint="eastAsia"/></w:rPr></w:pPr></w:p></w:ftr>

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:footnotes xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" 
xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du wp14"><w:footnote w:type="separator" w:id="-1"><w:p w14:paraId="0D637794" w14:textId="77777777" w:rsidR="00DE3E16" w:rsidRDefault="00DE3E16"><w:pPr><w:spacing w:line="240" w:lineRule="auto"/><w:rPr><w:rFonts w:hint="eastAsia"/></w:rPr></w:pPr><w:r><w:separator/></w:r></w:p></w:footnote><w:footnote w:type="continuationSeparator" w:id="0"><w:p w14:paraId="5D637BFB" w14:textId="77777777" w:rsidR="00DE3E16" w:rsidRDefault="00DE3E16"><w:pPr><w:spacing w:line="240" w:lineRule="auto"/><w:rPr><w:rFonts w:hint="eastAsia"/></w:rPr></w:pPr><w:r><w:continuationSeparator/></w:r></w:p></w:footnote></w:footnotes>

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/webSettings" Target="webSettings.xml"/><Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings" Target="settings.xml"/><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles" Target="styles.xml"/><Relationship Id="rId4" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable" Target="fontTable.xml"/></Relationships>

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:fonts xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du"><w:font w:name="Symbol"><w:panose1 w:val="05050102010706020507"/><w:charset w:val="02"/><w:family w:val="roman"/><w:pitch w:val="variable"/><w:sig w:usb0="00000000" w:usb1="10000000" w:usb2="00000000" w:usb3="00000000" w:csb0="80000000" w:csb1="00000000"/></w:font><w:font w:name="Times New Roman"><w:panose1 w:val="02020603050405020304"/><w:charset w:val="00"/><w:family w:val="roman"/><w:pitch w:val="variable"/><w:sig w:usb0="E0002EFF" w:usb1="C000785B" w:usb2="00000009" w:usb3="00000000" w:csb0="000001FF" w:csb1="00000000"/></w:font><w:font w:name="Arial"><w:panose1 w:val="020B0604020202020204"/><w:charset w:val="00"/><w:family w:val="swiss"/><w:pitch w:val="variable"/><w:sig w:usb0="E0002EFF" w:usb1="C000785B" w:usb2="00000009" w:usb3="00000000" w:csb0="000001FF" w:csb1="00000000"/></w:font><w:font w:name="MS PGothic"><w:panose1 w:val="020B0600070205080204"/><w:charset w:val="80"/><w:family w:val="swiss"/><w:pitch w:val="variable"/><w:sig w:usb0="E00002FF" w:usb1="6AC7FDFB" w:usb2="08000012" w:usb3="00000000" w:csb0="0002009F" 
w:csb1="00000000"/></w:font><w:font w:name="Microsoft YaHei UI"><w:panose1 w:val="020B0503020204020204"/><w:charset w:val="86"/><w:family w:val="swiss"/><w:pitch w:val="variable"/><w:sig w:usb0="80000287" w:usb1="2ACF3C50" w:usb2="00000016" w:usb3="00000000" w:csb0="0004001F" w:csb1="00000000"/></w:font><w:font w:name="黑体"><w:altName w:val="SimHei"/><w:panose1 w:val="02010609060101010101"/><w:charset w:val="86"/><w:family w:val="modern"/><w:pitch w:val="fixed"/><w:sig w:usb0="800002BF" w:usb1="38CF7CFA" w:usb2="00000016" w:usb3="00000000" w:csb0="00040001" w:csb1="00000000"/></w:font><w:font w:name="等线"><w:altName w:val="DengXian"/><w:panose1 w:val="02010600030101010101"/><w:charset w:val="86"/><w:family w:val="auto"/><w:pitch w:val="variable"/><w:sig w:usb0="A00002BF" w:usb1="38CF7CFA" w:usb2="00000016" w:usb3="00000000" w:csb0="0004000F" w:csb1="00000000"/></w:font><w:font w:name="等线 Light"><w:panose1 w:val="02010600030101010101"/><w:charset w:val="86"/><w:family w:val="auto"/><w:pitch w:val="variable"/><w:sig w:usb0="A00002BF" w:usb1="38CF7CFA" w:usb2="00000016" w:usb3="00000000" w:csb0="0004000F" w:csb1="00000000"/></w:font></w:fonts>

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:settings xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:sl="http://schemas.openxmlformats.org/schemaLibrary/2006/main" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du"><w:view w:val="normal"/><w:bordersDoNotSurroundHeader/><w:bordersDoNotSurroundFooter/><w:defaultTabStop w:val="420"/><w:drawingGridVerticalSpacing w:val="156"/><w:displayHorizontalDrawingGridEvery w:val="0"/><w:displayVerticalDrawingGridEvery w:val="2"/><w:characterSpacingControl w:val="compressPunctuation"/><w:compat><w:spaceForUL/><w:balanceSingleByteDoubleByteWidth/><w:doNotLeaveBackslashAlone/><w:ulTrailSpace/><w:doNotExpandShiftReturn/><w:adjustLineHeightInTable/><w:useFELayout/><w:compatSetting w:name="compatibilityMode" w:uri="http://schemas.microsoft.com/office/word" w:val="15"/><w:compatSetting w:name="overrideTableStyleFontSizeAndJustification" w:uri="http://schemas.microsoft.com/office/word" w:val="1"/><w:compatSetting 
w:name="enableOpenTypeFeatures" w:uri="http://schemas.microsoft.com/office/word" w:val="1"/><w:compatSetting w:name="doNotFlipMirrorIndents" w:uri="http://schemas.microsoft.com/office/word" w:val="1"/><w:compatSetting w:name="differentiateMultirowTableHeaders" w:uri="http://schemas.microsoft.com/office/word" w:val="1"/><w:compatSetting w:name="useWord2013TrackBottomHyphenation" w:uri="http://schemas.microsoft.com/office/word" w:val="0"/></w:compat><w:rsids><w:rsidRoot w:val="007012A4"/><w:rsid w:val="007012A4"/><w:rsid w:val="008049C9"/></w:rsids><m:mathPr><m:mathFont m:val="Cambria Math"/><m:brkBin m:val="before"/><m:brkBinSub m:val="--"/><m:smallFrac m:val="0"/><m:dispDef/><m:lMargin m:val="0"/><m:rMargin m:val="0"/><m:defJc m:val="centerGroup"/><m:wrapIndent m:val="1440"/><m:intLim m:val="subSup"/><m:naryLim m:val="undOvr"/></m:mathPr><w:themeFontLang w:val="en-US" w:eastAsia="zh-CN"/><w:clrSchemeMapping w:bg1="light1" w:t1="dark1" w:bg2="light2" w:t2="dark2" w:accent1="accent1" w:accent2="accent2" w:accent3="accent3" w:accent4="accent4" w:accent5="accent5" w:accent6="accent6" w:hyperlink="hyperlink" w:followedHyperlink="followedHyperlink"/><w:decimalSymbol w:val="."/><w:listSeparator w:val=","/><w15:chartTrackingRefBased/></w:settings>

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:webSettings xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du"><w:optimizeForBrowser/><w:allowPNG/></w:webSettings>

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:hdr xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" 
xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du wp14"><w:p w14:paraId="1CDCFD2F" w14:textId="77777777" w:rsidR="00BB4862" w:rsidRDefault="00BB4862"><w:pPr><w:pStyle w:val="ae"/><w:rPr><w:rFonts w:hint="eastAsia"/></w:rPr></w:pPr></w:p></w:hdr>

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:hdr xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" 
xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du wp14"><w:p w14:paraId="777F27D5" w14:textId="77777777" w:rsidR="00BB4862" w:rsidRDefault="00BB4862"><w:pPr><w:pStyle w:val="ae"/><w:rPr><w:rFonts w:hint="eastAsia"/></w:rPr></w:pPr></w:p></w:hdr>

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:hdr xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" 
xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du wp14"><w:p w14:paraId="0A1573FC" w14:textId="77777777" w:rsidR="00BB4862" w:rsidRDefault="00BB4862"><w:pPr><w:pStyle w:val="ae"/><w:rPr><w:rFonts w:hint="eastAsia"/></w:rPr></w:pPr></w:p></w:hdr>

Binary file not shown.

After

Width:  |  Height:  |  Size: 136 KiB

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:settings xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:sl="http://schemas.openxmlformats.org/schemaLibrary/2006/main" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du"><w:zoom w:percent="100"/><w:bordersDoNotSurroundHeader/><w:bordersDoNotSurroundFooter/><w:proofState w:spelling="clean" w:grammar="clean"/><w:attachedTemplate r:id="rId1"/><w:defaultTabStop w:val="720"/><w:characterSpacingControl w:val="doNotCompress"/><w:hdrShapeDefaults><o:shapedefaults v:ext="edit" spidmax="2050"/></w:hdrShapeDefaults><w:footnotePr><w:footnote w:id="-1"/><w:footnote w:id="0"/></w:footnotePr><w:endnotePr><w:endnote w:id="-1"/><w:endnote w:id="0"/></w:endnotePr><w:compat><w:useFELayout/><w:compatSetting w:name="compatibilityMode" w:uri="http://schemas.microsoft.com/office/word" w:val="15"/><w:compatSetting w:name="overrideTableStyleFontSizeAndJustification" w:uri="http://schemas.microsoft.com/office/word" w:val="1"/><w:compatSetting 
w:name="enableOpenTypeFeatures" w:uri="http://schemas.microsoft.com/office/word" w:val="1"/><w:compatSetting w:name="doNotFlipMirrorIndents" w:uri="http://schemas.microsoft.com/office/word" w:val="1"/><w:compatSetting w:name="differentiateMultirowTableHeaders" w:uri="http://schemas.microsoft.com/office/word" w:val="1"/><w:compatSetting w:name="useWord2013TrackBottomHyphenation" w:uri="http://schemas.microsoft.com/office/word" w:val="0"/></w:compat><w:rsids><w:rsidRoot w:val="00365A38"/><w:rsid w:val="00055AF8"/><w:rsid w:val="001365E4"/><w:rsid w:val="002558FA"/><w:rsid w:val="00323F56"/><w:rsid w:val="00365A38"/><w:rsid w:val="003667F4"/><w:rsid w:val="00616194"/><w:rsid w:val="006924B3"/><w:rsid w:val="006A3739"/><w:rsid w:val="007577D4"/><w:rsid w:val="00793AFB"/><w:rsid w:val="007D3668"/><w:rsid w:val="008049C9"/><w:rsid w:val="008049DB"/><w:rsid w:val="00837ECD"/><w:rsid w:val="00897AA4"/><w:rsid w:val="00907574"/><w:rsid w:val="00934F6F"/><w:rsid w:val="00966901"/><w:rsid w:val="00981A82"/><w:rsid w:val="00A93410"/><w:rsid w:val="00B76A92"/><w:rsid w:val="00BB4862"/><w:rsid w:val="00BF2506"/><w:rsid w:val="00C3067E"/><w:rsid w:val="00C5328D"/><w:rsid w:val="00CE7F7E"/><w:rsid w:val="00CF07F2"/><w:rsid w:val="00D934CD"/><w:rsid w:val="00DE3E16"/><w:rsid w:val="00E4324B"/><w:rsid w:val="00E9657B"/><w:rsid w:val="00F011A8"/></w:rsids><m:mathPr><m:mathFont m:val="Cambria Math"/><m:brkBin m:val="before"/><m:brkBinSub m:val="--"/><m:smallFrac m:val="0"/><m:dispDef/><m:lMargin m:val="0"/><m:rMargin m:val="0"/><m:defJc m:val="centerGroup"/><m:wrapIndent m:val="1440"/><m:intLim m:val="subSup"/><m:naryLim m:val="undOvr"/></m:mathPr><w:themeFontLang w:val="en-US" w:eastAsia="ja-JP" w:bidi="ar-SA"/><w:clrSchemeMapping w:bg1="light1" w:t1="dark1" w:bg2="light2" w:t2="dark2" w:accent1="accent1" w:accent2="accent2" w:accent3="accent3" w:accent4="accent4" w:accent5="accent5" w:accent6="accent6" w:hyperlink="hyperlink" 
w:followedHyperlink="followedHyperlink"/><w:shapeDefaults><o:shapedefaults v:ext="edit" spidmax="2050"/><o:shapelayout v:ext="edit"><o:idmap v:ext="edit" data="2"/></o:shapelayout></w:shapeDefaults><w:decimalSymbol w:val="."/><w:listSeparator w:val=","/><w14:docId w14:val="79E9351B"/><w15:chartTrackingRefBased/><w15:docId w15:val="{53785501-C96A-4E1A-AEE6-3D202E3FAC5F}"/></w:settings>

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:webSettings xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du"><w:optimizeForBrowser/><w:allowPNG/></w:webSettings>

Binary file not shown.

View File

@@ -0,0 +1 @@
# Copyright (c) Opendatalab. All rights reserved.

View File

View File

@@ -0,0 +1,274 @@
"""
Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
On 23/01/2025
"""
# Characters that escape_latex() prefixes with a backslash.
CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")

# Empty string; used as the separator when joining converted fragments.
BLANK = ""

# A single backslash character.
BACKSLASH = "\\"

# Ampersand constant (presumably the LaTeX alignment marker; confirm at use site).
ALN = "&"
# Accent / grouping characters -> LaTeX templates.
#
# Every value is a ``str.format`` template with one positional field ``{0}``
# for the accented expression. Literal braces are doubled, so each template
# must end in exactly ``{{{0}}}``; an extra ``}`` makes ``str.format`` raise
# ``ValueError`` (this was the case for U+030A and U+030C).
CHR = {
    # Unicode : Latex Math Symbols
    # Top accents
    "\u0300": "\\grave{{{0}}}",
    "\u0301": "\\acute{{{0}}}",
    "\u0302": "\\hat{{{0}}}",
    "\u0303": "\\tilde{{{0}}}",
    "\u0304": "\\bar{{{0}}}",
    "\u0305": "\\overbar{{{0}}}",
    "\u0306": "\\breve{{{0}}}",
    "\u0307": "\\dot{{{0}}}",
    "\u0308": "\\ddot{{{0}}}",
    "\u0309": "\\ovhook{{{0}}}",
    "\u030a": "\\ocirc{{{0}}}",
    "\u030c": "\\check{{{0}}}",
    "\u0310": "\\candra{{{0}}}",
    "\u0312": "\\oturnedcomma{{{0}}}",
    "\u0315": "\\ocommatopright{{{0}}}",
    "\u031a": "\\droang{{{0}}}",
    "\u0338": "\\not{{{0}}}",
    "\u20d0": "\\leftharpoonaccent{{{0}}}",
    "\u20d1": "\\rightharpoonaccent{{{0}}}",
    "\u20d2": "\\vertoverlay{{{0}}}",
    "\u20d6": "\\overleftarrow{{{0}}}",
    "\u20d7": "\\vec{{{0}}}",
    "\u20db": "\\dddot{{{0}}}",
    "\u20dc": "\\ddddot{{{0}}}",
    "\u20e1": "\\overleftrightarrow{{{0}}}",
    "\u20e7": "\\annuity{{{0}}}",
    "\u20e9": "\\widebridgeabove{{{0}}}",
    "\u20f0": "\\asteraccent{{{0}}}",
    # Bottom accents
    "\u0330": "\\wideutilde{{{0}}}",
    "\u0331": "\\underbar{{{0}}}",
    "\u20e8": "\\threeunderdot{{{0}}}",
    "\u20ec": "\\underrightharpoondown{{{0}}}",
    "\u20ed": "\\underleftharpoondown{{{0}}}",
    "\u20ee": "\\underleftarrow{{{0}}}",
    "\u20ef": "\\underrightarrow{{{0}}}",
    # Over | group
    "\u23b4": "\\overbracket{{{0}}}",
    "\u23dc": "\\overparen{{{0}}}",
    "\u23de": "\\overbrace{{{0}}}",
    # Under| group
    "\u23b5": "\\underbracket{{{0}}}",
    "\u23dd": "\\underparen{{{0}}}",
    "\u23df": "\\underbrace{{{0}}}",
}
# Big operators: Unicode character -> LaTeX command (no trailing space).
# Keys are built from their code points so the table reads as (code point, command).
CHR_BO = {
    chr(code_point): command
    for code_point, command in (
        (0x2140, "\\Bbbsum"),
        (0x220F, "\\prod"),
        (0x2210, "\\coprod"),
        (0x2211, "\\sum"),
        (0x222B, "\\int"),
        (0x222C, "\\iint"),
        (0x222D, "\\iiint"),
        (0x222E, "\\oint"),
        (0x222F, "\\oiint"),
        (0x2230, "\\oiiint"),
        (0x22C0, "\\bigwedge"),
        (0x22C1, "\\bigvee"),
        (0x22C2, "\\bigcap"),
        (0x22C3, "\\bigcup"),
        (0x2A00, "\\bigodot"),
        (0x2A01, "\\bigoplus"),
        (0x2A02, "\\bigotimes"),
    )
}
# Text characters -> LaTeX replacement.
#
# Command replacements carry a trailing space so that the following token is
# not glued onto the command name. Fixed relative to the previous table:
#   U+1D6FF is italic delta (was "\\theta "), U+1D707/U+1D708/U+1D70F are
#   mu/nu/tau (were the truncated "\\m ", "\\n ", "\\ta "), and U+2193 is
#   "\\downarrow " (was the non-existent "\\downright ").
T = {
    # Greek letters (mathematical italic, U+1D6FC..U+1D71B)
    "\U0001d6fc": "\\alpha ",
    "\U0001d6fd": "\\beta ",
    "\U0001d6fe": "\\gamma ",
    "\U0001d6ff": "\\delta ",
    "\U0001d700": "\\epsilon ",
    "\U0001d701": "\\zeta ",
    "\U0001d702": "\\eta ",
    "\U0001d703": "\\theta ",
    "\U0001d704": "\\iota ",
    "\U0001d705": "\\kappa ",
    "\U0001d706": "\\lambda ",
    "\U0001d707": "\\mu ",
    "\U0001d708": "\\nu ",
    "\U0001d709": "\\xi ",
    "\U0001d70a": "\\omicron ",
    "\U0001d70b": "\\pi ",
    "\U0001d70c": "\\rho ",
    "\U0001d70d": "\\varsigma ",
    "\U0001d70e": "\\sigma ",
    "\U0001d70f": "\\tau ",
    "\U0001d710": "\\upsilon ",
    "\U0001d711": "\\phi ",
    "\U0001d712": "\\chi ",
    "\U0001d713": "\\psi ",
    "\U0001d714": "\\omega ",
    "\U0001d715": "\\partial ",
    "\U0001d716": "\\varepsilon ",
    "\U0001d717": "\\vartheta ",
    "\U0001d718": "\\varkappa ",
    "\U0001d719": "\\varphi ",
    "\U0001d71a": "\\varrho ",
    "\U0001d71b": "\\varpi ",
    # Relation symbols
    "\u2190": "\\leftarrow ",
    "\u2191": "\\uparrow ",
    "\u2192": "\\rightarrow ",
    "\u2193": "\\downarrow ",
    "\u2194": "\\leftrightarrow ",
    "\u2195": "\\updownarrow ",
    "\u2196": "\\nwarrow ",
    "\u2197": "\\nearrow ",
    "\u2198": "\\searrow ",
    "\u2199": "\\swarrow ",
    "\u22ee": "\\vdots ",
    "\u22ef": "\\cdots ",
    "\u22f0": "\\adots ",
    "\u22f1": "\\ddots ",
    "\u2260": "\\ne ",
    "\u2264": "\\leq ",
    "\u2265": "\\geq ",
    "\u2266": "\\leqq ",
    "\u2267": "\\geqq ",
    "\u2268": "\\lneqq ",
    "\u2269": "\\gneqq ",
    "\u226a": "\\ll ",
    "\u226b": "\\gg ",
    "\u2208": "\\in ",
    "\u2209": "\\notin ",
    "\u220b": "\\ni ",
    "\u220c": "\\nni ",
    # Ordinary symbols
    "\u221e": "\\infty ",
    # Binary relations
    "\u00b1": "\\pm ",
    "\u2213": "\\mp ",
    # Italic, Latin, uppercase: U+1D434..U+1D44D -> "A".."Z"
    **{chr(0x1D434 + i): chr(ord("A") + i) for i in range(26)},
    # Italic, Latin, lowercase: U+1D44E..U+1D467 -> "a".."z".
    # Offset 7 (U+1D455, where "h" would sit) is skipped, exactly as in the
    # original table: that code point is unassigned in Unicode.
    **{chr(0x1D44E + i): chr(ord("a") + i) for i in range(26) if i != 7},
}
# Function name -> LaTeX template containing the "{fe}" placeholder.
# Every function wraps its argument in parentheses except "mod", which is
# followed by a space and the bare argument.
FUNC = {
    name: "\\mod {fe}" if name == "mod" else "\\" + name + "({fe})"
    for name in (
        "sin",
        "cos",
        "tan",
        "arcsin",
        "arccos",
        "arctan",
        "arccot",
        "sinh",
        "cosh",
        "tanh",
        "coth",
        "sec",
        "csc",
        "mod",
        "max",
        "min",
    )
}
# Placeholder token that appears in the FUNC templates.
FUNC_PLACE = "{fe}"

# Two backslashes: the LaTeX line break.
BRK = "\\\\"

# Accent template used when an accent carries no explicit character.
CHR_DEFAULT = {
    "ACC_VAL": "\\hat{{{0}}}",
}

# Bar position -> template with one positional field for the expression.
POS = {
    "top": "\\overline{{{0}}}",  # not sure
    "bot": "\\underline{{{0}}}",
}

# Bar template used when no position is given (same text as POS["top"]).
POS_DEFAULT = {
    "BAR_VAL": POS["top"],
}

# Subscript / superscript templates (one positional field each).
SUB = "_{{{0}}}"
SUP = "^{{{0}}}"

# Fraction type -> template with named fields ``num`` and ``den``.
F = {
    "bar": "\\frac{{{num}}}{{{den}}}",
    "skw": "^{{{num}}}/_{{{den}}}",
    "noBar": "\\genfrac{{}}{{}}{{0pt}}{{}}{{{num}}}{{{den}}}",
    "lin": "{{{num}}}/{{{den}}}",
}

# Fraction template used when no type is given (same text as F["bar"]).
F_DEFAULT = F["bar"]

# Delimiter template with named fields ``left``, ``text`` and ``right``.
D = "\\left{left}{text}\\right{right}"

# Default delimiters; "null" is the "." that stands for "no delimiter".
D_DEFAULT = {
    "left": "(",
    "right": ")",
    "null": ".",
}

# Radical templates: with an explicit degree, and the plain square root.
RAD = "\\sqrt[{deg}]{{{text}}}"
RAD_DEFAULT = "\\sqrt{{{text}}}"

# Single-column, centred array environment.
ARR = "\\begin{{array}}{{c}}{text}\\end{{array}}"

# Limit-style function name -> template with the named field ``lim``.
LIM_FUNC = {
    "lim": "\\lim_{{{lim}}}",
    "max": "\\max_{{{lim}}}",
    "min": "\\min_{{{lim}}}",
}

# Pair of arrow spellings (presumably a from/to replacement pair; confirm at use site).
LIM_TO = ("\\rightarrow", "\\to")

# Upper-limit template with named fields ``lim`` and ``text``.
LIM_UPP = "\\overset{{{lim}}}{{{text}}}"

# Matrix environment template with the named field ``text``.
M = "\\begin{{matrix}}{text}\\end{{matrix}}"

View File

@@ -0,0 +1,455 @@
"""
Office Math Markup Language (OMML)
Adapted from https://github.com/xiilei/dwml/blob/master/dwml/omml.py
On 23/01/2025
"""
import lxml.etree as ET
from loguru import logger
from pylatexenc.latexencode import UnicodeToLatexEncoder
from .latex_dict import (
ALN,
ARR,
BACKSLASH,
BLANK,
BRK,
CHARS,
CHR,
CHR_BO,
CHR_DEFAULT,
D_DEFAULT,
F_DEFAULT,
FUNC,
FUNC_PLACE,
LIM_FUNC,
LIM_TO,
LIM_UPP,
POS,
POS_DEFAULT,
RAD,
RAD_DEFAULT,
SUB,
SUP,
D,
F,
M,
T,
)
# Office Math namespace in Clark notation ("{uri}"), as it prefixes element tags.
OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}"
def load(stream):
    """Parse XML from ``stream`` and lazily yield one converter per ``oMath``.

    The search uses the bare tag path ``OMML_NS + "oMath"`` on the parsed
    tree. Each match is wrapped in ``oMath2Latex``. Nothing is parsed until
    the returned generator is first advanced.
    """
    document = ET.parse(stream)
    yield from (
        oMath2Latex(node) for node in document.findall(OMML_NS + "oMath")
    )
def load_string(string):
    """Parse XML from ``string`` and lazily yield one converter per ``oMath``.

    The search uses the bare tag path ``OMML_NS + "oMath"`` on the parsed
    root element. Each match is wrapped in ``oMath2Latex``. Nothing is parsed
    until the returned generator is first advanced.
    """
    document_root = ET.fromstring(string)
    yield from (
        oMath2Latex(node) for node in document_root.findall(OMML_NS + "oMath")
    )
def escape_latex(strs):
    """Backslash-escape the characters listed in ``CHARS``.

    Every doubled backslash in ``strs`` is first collapsed to a single one.
    Then each character that is in ``CHARS`` gets a backslash prefix, unless
    the character immediately before it (after collapsing) is itself a
    backslash.

    Returns the escaped string.
    """
    collapsed = strs.replace(r"\\", "\\")
    pieces = []
    previous = None
    for char in collapsed:
        needs_escape = char in CHARS and previous != BACKSLASH
        pieces.append(BACKSLASH + char if needs_escape else char)
        previous = char
    return BLANK.join(pieces)
def get_val(key, default=None, store=CHR):
    """Look ``key`` up in ``store``, with fallbacks.

    * ``key is None``  -> ``default``
    * ``store`` falsy  -> ``key`` itself
    * otherwise        -> ``store[key]`` if present, else ``key`` itself
    """
    if key is None:
        return default
    if not store:
        return key
    return store.get(key, key)
class Tag2Method:
    """Dispatch OMML child elements to handler methods by short tag name.

    Subclasses provide ``tag2meth``, a dict mapping the tag name without the
    OMML namespace to a plain function taking ``(self, elm)``.
    """

    # Empty default so the base class (or a subclass without its own table)
    # reports "no handler" instead of raising AttributeError.
    tag2meth = {}

    def call_method(self, elm, stag=None):
        """Run the handler registered for ``elm`` and return its result.

        ``stag`` is the short tag name; when omitted it is derived from
        ``elm.tag`` by removing ``OMML_NS``. Returns ``None`` when no handler
        is registered for the tag.
        """
        if stag is None:
            stag = elm.tag.replace(OMML_NS, "")
        method = self.tag2meth.get(stag)
        if not method:
            return None
        return method(self, elm)

    def process_children_list(self, elm, include=None):
        """
        process children of the elm,return iterable

        Yields ``(stag, result, child)`` for each child in the OMML namespace
        that produces a non-``None`` result. ``include``, when truthy,
        restricts processing to the listed short tag names.
        """
        for _e in list(elm):
            tag = _e.tag
            # Comment / processing-instruction nodes in lxml expose a
            # non-string ``tag``; the substring test below would raise
            # TypeError on them, so they are skipped explicitly.
            if not isinstance(tag, str) or OMML_NS not in tag:
                continue
            stag = tag.replace(OMML_NS, "")
            if include and (stag not in include):
                continue
            t = self.call_method(_e, stag=stag)
            if t is None:
                # No handler (or the handler returned None): give the
                # subclass fallback a chance.
                t = self.process_unknow(_e, stag)
                if t is None:
                    continue
            yield (stag, t, _e)

    def process_children_dict(self, elm, include=None):
        """
        process children of the elm,return dict

        Keys are short tag names; when a tag occurs more than once the last
        result wins.
        """
        return {
            stag: t for stag, t, _e in self.process_children_list(elm, include)
        }

    def process_children(self, elm, include=None):
        """
        process children of the elm,return string

        Results that are ``Tag2Method`` instances are converted with
        ``str()``; everything is then joined with ``BLANK``.
        """
        return BLANK.join(
            str(t) if isinstance(t, Tag2Method) else t
            for _stag, t, _e in self.process_children_list(elm, include)
        )

    def process_unknow(self, elm, stag):
        """Fallback for children without a usable handler result.

        The base implementation returns ``None``, which makes the child be
        skipped.
        """
        return None
class Pr(Tag2Method):
    """common properties of element

    Collects the ``val`` attribute of ``chr``, ``pos``, ``begChr``, ``endChr``
    and ``type`` children and exposes each as an attribute of the same name.
    Attributes that were not collected read as ``None``. ``text`` holds the
    joined output of the children, which is one ``BRK`` per ``brk`` child.
    """

    text = ""

    __val_tags = ("chr", "pos", "begChr", "endChr", "type")

    # Class-level placeholder, replaced per instance in __init__.
    __innerdict = None  # can't use the __dict__

    def __init__(self, elm):
        self.__innerdict = {}
        self.text = self.process_children(elm)

    def __str__(self):
        return self.text

    def __unicode__(self):
        # Bound call: passing ``self`` again would raise TypeError.
        return self.__str__()

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails.
        return self.__innerdict.get(name, None)

    def do_brk(self, elm):
        """Record a line break and contribute ``BRK`` to ``text``."""
        self.__innerdict["brk"] = BRK
        return BRK

    def do_common(self, elm):
        """Store the child's ``val`` attribute under its short tag name.

        Returns ``None`` so the child adds nothing to ``text``.
        """
        stag = elm.tag.replace(OMML_NS, "")
        if stag in self.__val_tags:
            t = elm.get(f"{OMML_NS}val")
            self.__innerdict[stag] = t
        return None

    tag2meth = {
        "brk": do_brk,
        "chr": do_common,
        "pos": do_common,
        "begChr": do_common,
        "endChr": do_common,
        "type": do_common,
    }
class oMath2Latex(Tag2Method):
    """Convert an OMML ``oMath`` element to LaTeX.

    ``tag2meth`` maps each supported OMML tag to the ``do_*`` handler that
    renders it.  Tags listed in ``__direct_tags`` are rendered by simply
    processing their children, and tags ending in ``Pr`` are parsed into
    ``Pr`` property objects (see ``process_unknow``).
    """

    _t_dict = T
    __direct_tags = ("box", "sSub", "sSup", "sSubSup", "num", "den", "deg", "e")
    u = UnicodeToLatexEncoder(
        replacement_latex_protection="braces-all",
        unknown_char_policy="keep",
        unknown_char_warning=False,
    )

    def __init__(self, element):
        """Render *element* immediately; the result is kept in ``latex``."""
        self._latex = self.process_children(element)

    def __str__(self):
        # NOTE(review): as written this replaces a single space with a single
        # space, i.e. it is a no-op. It may have been meant to collapse
        # double spaces - confirm against the upstream file before changing.
        return self.latex.replace(" ", " ")

    def __unicode__(self):
        # Fixed: the original called ``self.__str__(self)``, passing ``self``
        # twice and raising TypeError.
        return self.__str__()

    def process_unknow(self, elm, stag):
        """Handle tags that have no entry in ``tag2meth``.

        Returns the joined children for pass-through tags, a ``Pr`` object
        for ``*Pr`` tags, and ``None`` for everything else.
        """
        if stag in self.__direct_tags:
            return self.process_children(elm)
        if stag[-2:] == "Pr":
            return Pr(elm)
        return None

    @property
    def latex(self):
        """The LaTeX string produced in the constructor."""
        return self._latex

    def do_acc(self, elm):
        """
        the accent function
        """
        c_dict = self.process_children_dict(elm)
        latex_s = get_val(
            c_dict["accPr"].chr, default=CHR_DEFAULT.get("ACC_VAL"), store=CHR
        )
        return latex_s.format(c_dict["e"])

    def do_bar(self, elm):
        """
        the bar function
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict["barPr"]
        latex_s = get_val(pr.pos, default=POS_DEFAULT.get("BAR_VAL"), store=POS)
        return pr.text + latex_s.format(c_dict["e"])

    def do_d(self, elm):
        """
        the delimiter object
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict["dPr"]
        null = D_DEFAULT.get("null")
        s_val = get_val(pr.begChr, default=D_DEFAULT.get("left"), store=T)
        e_val = get_val(pr.endChr, default=D_DEFAULT.get("right"), store=T)
        return pr.text + D.format(
            left=escape_latex(s_val) if s_val else null,
            text=c_dict["e"],
            right=escape_latex(e_val) if e_val else null,
        )

    def do_spre(self, elm):
        """
        the Pre-Sub-Superscript object -- Not support yet
        """
        # Unsupported: nothing is rendered.
        return None

    def do_sub(self, elm):
        """the subscript part of a script object"""
        return SUB.format(self.process_children(elm))

    def do_sup(self, elm):
        """the superscript part of a script object"""
        return SUP.format(self.process_children(elm))

    def do_f(self, elm):
        """
        the fraction object
        """
        c_dict = self.process_children_dict(elm)
        num = c_dict.get("num")
        den = c_dict.get("den")
        pr = c_dict.get("fPr")
        if pr is None:
            # Handle missing fPr element gracefully
            logger.debug("Missing fPr element in fraction, using default formatting")
            return F_DEFAULT.format(num=num, den=den)
        latex_s = get_val(pr.type, default=F_DEFAULT, store=F)
        return pr.text + latex_s.format(num=num, den=den)

    def do_func(self, elm):
        """
        the Function-Apply object (Examples:sin cos)

        A missing ``fName`` or ``e`` child no longer raises: without a
        function name only the argument is emitted, and a missing argument
        is rendered as an empty string (same spirit as ``do_f``).
        """
        c_dict = self.process_children_dict(elm)
        func_name = c_dict.get("fName")
        argument = c_dict.get("e")
        if func_name is None:
            logger.debug("Missing fName element in function, emitting argument only")
            func_name = FUNC_PLACE
        if argument is None:
            argument = ""
        return func_name.replace(FUNC_PLACE, argument)

    def do_fname(self, elm):
        """
        the func name
        """
        latex_chars = []
        for stag, t, e in self.process_children_list(elm):
            if stag == "r":
                if FUNC.get(t):
                    latex_chars.append(FUNC[t])
                    continue
                logger.warning("Function not supported, will default to text: %s", t)
            # Unknown function names and non-run children fall back to text.
            if isinstance(t, str):
                latex_chars.append(t)
        t = BLANK.join(latex_chars)
        return t if FUNC_PLACE in t else t + FUNC_PLACE  # do_func will replace this

    def do_groupchr(self, elm):
        """
        the Group-Character object
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict["groupChrPr"]
        latex_s = get_val(pr.chr)
        return pr.text + latex_s.format(c_dict["e"])

    def do_rad(self, elm):
        """
        the radical object
        """
        c_dict = self.process_children_dict(elm)
        text = c_dict.get("e")
        deg_text = c_dict.get("deg")
        if deg_text:
            return RAD.format(deg=deg_text, text=text)
        return RAD_DEFAULT.format(text=text)

    def do_eqarr(self, elm):
        """
        the Array object
        """
        rows = (t for stag, t, e in self.process_children_list(elm, include=("e",)))
        return ARR.format(text=BRK.join(rows))

    def do_limlow(self, elm):
        """
        the Lower-Limit object

        Raises:
            RuntimeError: when the base expression has no entry in
                ``LIM_FUNC``.
        """
        t_dict = self.process_children_dict(elm, include=("e", "lim"))
        latex_s = LIM_FUNC.get(t_dict["e"])
        if not latex_s:
            raise RuntimeError("Not support lim {}".format(t_dict["e"]))
        return latex_s.format(lim=t_dict.get("lim"))

    def do_limupp(self, elm):
        """
        the Upper-Limit object
        """
        t_dict = self.process_children_dict(elm, include=("e", "lim"))
        return LIM_UPP.format(lim=t_dict.get("lim"), text=t_dict.get("e"))

    def do_lim(self, elm):
        """
        the lower limit of the limLow object and the upper limit of the limUpp function
        """
        return self.process_children(elm).replace(LIM_TO[0], LIM_TO[1])

    def do_m(self, elm):
        """
        the Matrix object
        """
        # Only the rows are rendered; the matrix properties (mPr) are ignored.
        rows = [t for stag, t, e in self.process_children_list(elm) if stag == "mr"]
        return M.format(text=BRK.join(rows))

    def do_mr(self, elm):
        """
        a single row of the matrix m
        """
        cells = (t for stag, t, e in self.process_children_list(elm, include=("e",)))
        return ALN.join(cells)

    def do_nary(self, elm):
        """
        the n-ary object
        """
        res = []
        bo = ""
        for stag, t, e in self.process_children_list(elm):
            if stag == "naryPr":
                # if <m:naryPr> contains no <m:chr>, the n-ary represents an integral
                bo = get_val(t.chr, default="\\int", store=CHR_BO)
            else:
                res.append(t)
        return bo + BLANK.join(res)

    def process_unicode(self, s):
        """Encode the text *s* as LaTeX and tidy the encoder's wrapping.

        Braces added by the encoder around the whole result are replaced by
        surrounding spaces, ``\\ensuremath{...}`` wrappers are unwrapped, and
        results starting with ``\\text`` are wrapped in ``\\text{...}``.
        """
        out_latex_str = self.u.unicode_to_latex(s)
        encoder_added_braces = (
            not s.startswith("{")
            and out_latex_str.startswith("{")
            and not s.endswith("}")
            and out_latex_str.endswith("}")
        )
        if encoder_added_braces:
            out_latex_str = f" {out_latex_str[1:-1]} "
        if "ensuremath" in out_latex_str:
            out_latex_str = out_latex_str.replace("\\ensuremath{", " ")
            out_latex_str = out_latex_str.replace("}", " ")
        if out_latex_str.strip().startswith("\\text"):
            out_latex_str = f" \\text{{{out_latex_str}}} "
        return out_latex_str

    def do_r(self, elm):
        """
        Get text from 'r' element,And try convert them to latex symbols
        @todo text style support , (sty)
        @todo \\text (latex pure text support)
        """
        _str = []
        _base_str = []
        found_text = elm.findtext(f"./{OMML_NS}t")
        if found_text:
            # Encode character by character, keeping the raw text alongside.
            for s in found_text:
                _str.append(self.process_unicode(s))
                _base_str.append(s)
        proc_str = escape_latex(BLANK.join(_str))
        base_proc_str = BLANK.join(_base_str)
        # Braces that were not in the source text came from the encoder and
        # must stay unescaped.
        if "{" not in base_proc_str and "\\{" in proc_str:
            proc_str = proc_str.replace("\\{", "{")
        if "}" not in base_proc_str and "\\}" in proc_str:
            proc_str = proc_str.replace("\\}", "}")
        return proc_str

    tag2meth = {
        "acc": do_acc,
        "r": do_r,
        "bar": do_bar,
        "sub": do_sub,
        "sup": do_sup,
        "f": do_f,
        "func": do_func,
        "fName": do_fname,
        "groupChr": do_groupchr,
        "d": do_d,
        "rad": do_rad,
        "eqArr": do_eqarr,
        "limLow": do_limlow,
        "limUpp": do_limupp,
        "lim": do_lim,
        "m": do_m,
        "mr": do_mr,
        "nary": do_nary,
    }

View File

@@ -0,0 +1,55 @@
import xml.dom.minidom
from mammoth.docx.xmlparser import XmlText, XmlElement
from mammoth.docx.office_xml import _collapse_alternate_content, _namespaces
def parse_xml_str(xml_str, namespace_mapping=None):
    """Parse an XML string into a tree of ``XmlElement``/``XmlText`` nodes.

    Element and attribute names are rewritten as ``prefix:local`` when
    their namespace URI appears in *namespace_mapping*, as
    ``{uri}local`` when it does not, and as the bare local name when the
    node has no namespace.  Namespace declaration attributes are dropped,
    and nodes that are neither elements nor text are discarded.

    Args:
        xml_str: XML document as accepted by ``minidom.parseString``.
        namespace_mapping: Optional iterable of ``(prefix, uri)`` pairs.

    Returns:
        The converted document element.
    """
    # NOTE(review): the Python docs warn that xml.dom.minidom is not hardened
    # against maliciously constructed XML - confirm inputs are trusted.
    xmlns_uri = "http://www.w3.org/2000/xmlns/"
    if namespace_mapping is None:
        uri_to_prefix = {}
    else:
        uri_to_prefix = {uri: prefix for prefix, uri in namespace_mapping}
    dom = xml.dom.minidom.parseString(xml_str)

    def qualified_name(node):
        uri = node.namespaceURI
        if uri is None:
            return node.localName
        prefix = uri_to_prefix.get(uri)
        if prefix is None:
            return "{%s}%s" % (uri, node.localName)
        return "%s:%s" % (prefix, node.localName)

    def to_element(element):
        name = qualified_name(element)
        attributes = {
            qualified_name(attr): attr.value
            for attr in element.attributes.values()
            if attr.namespaceURI != xmlns_uri
        }
        children = []
        for child in element.childNodes:
            converted = to_node(child)
            if converted is not None:
                children.append(converted)
        return XmlElement(name, attributes, children)

    def to_node(node):
        kind = node.nodeType
        if kind == xml.dom.Node.ELEMENT_NODE:
            return to_element(node)
        if kind == xml.dom.Node.TEXT_NODE:
            return XmlText(node.nodeValue)
        return None

    return to_node(dom.documentElement)
def read_str(xml_str):
    """Parse *xml_str* using the ``_namespaces`` prefix mapping.

    The parsed tree is passed through ``_collapse_alternate_content`` and
    the first item of its result is returned.
    """
    parsed_root = parse_xml_str(xml_str, _namespaces)
    collapsed = _collapse_alternate_content(parsed_root)
    return collapsed[0]

View File

18
mineru/model/pptx/main.py Normal file
View File

@@ -0,0 +1,18 @@
from typing import BinaryIO
from mineru.model.pptx.pptx_converter import PptxConverter
def convert_path(file_path: str):
    """Open the file at *file_path* in binary mode and convert it.

    Returns:
        Whatever ``convert_binary`` returns for the opened file.
    """
    with open(file_path, "rb") as pptx_file:
        pages = convert_binary(pptx_file)
    return pages
def convert_binary(file_binary: BinaryIO):
    """Convert a binary PPTX stream with a fresh ``PptxConverter``.

    Returns:
        The converter's ``pages`` attribute after conversion.
    """
    pptx_converter = PptxConverter()
    pptx_converter.convert(file_binary)
    return pptx_converter.pages
if __name__ == "__main__":
    # Manual smoke run: convert the sample deck from the working directory.
    sample_pages = convert_path("powerpoint_sample.pptx")
    print(sample_pages)

Binary file not shown.

View File

@@ -0,0 +1,610 @@
from io import BytesIO
from typing import Final, BinaryIO, Optional
from lxml import etree
from pptx import Presentation, presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
from pptx.oxml.text import CT_TextLineBreak
from loguru import logger
from PIL import Image, UnidentifiedImageError, WmfImagePlugin
from mineru.utils.enum_class import BlockType
from mineru.utils.pdf_reader import image_to_b64str
class PptxConverter:
def __init__(self):
    """Create an empty converter; ``convert`` fills ``pages``."""
    # Output state: finished pages, and the page currently being filled.
    self.pages = []
    self.cur_page = []
    self.list_block_stack: list = []  # stack of list blocks being built
    # Input state, set by ``convert``.
    self.file_stream = None
    self.pptx_obj = None
    # XML namespace prefixes used in find/xpath lookups.
    self.namespaces = dict(
        a="http://schemas.openxmlformats.org/drawingml/2006/main",
        c="http://schemas.openxmlformats.org/drawingml/2006/chart",
        p="http://schemas.openxmlformats.org/presentationml/2006/main",
    )
def convert(
    self,
    file_stream: BinaryIO,
):
    """Load a presentation from *file_stream* and collect its blocks.

    The current page is registered in ``pages`` before the walk, and a
    trailing empty page is removed afterwards.  Results are left in
    ``self.pages``; nothing is returned.

    Args:
        file_stream: Binary stream of the PPTX file.
    """
    self.file_stream = file_stream
    deck = Presentation(file_stream)
    self.pptx_obj = deck
    self.pages.append(self.cur_page)
    if deck:
        self._walk_linear(deck)
    # Drop the empty page left at the end of the list.
    if not self.pages[-1]:
        self.pages.pop()
def _walk_linear(self, pptx_obj: presentation.Presentation):
    """Visit every shape of every slide and emit one page per slide.

    For each shape, in order: members of a group are visited recursively,
    a table is converted, a picture is converted, and finally any
    non-blank text is converted.  After each slide a fresh page is started
    and appended to ``self.pages``.

    Args:
        pptx_obj: The loaded presentation.
    """

    def visit(shape):
        # Group members are processed before the group shape itself.
        if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
            for member in shape.shapes:
                visit(member)

        if shape.has_table:
            self._handle_tables(shape)

        if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
            # ``hasattr`` only swallows AttributeError, but python-pptx raises
            # ValueError for a picture without an embedded image (e.g. a
            # linked one). Skip such pictures instead of aborting the run.
            try:
                has_image = hasattr(shape, "image")
            except ValueError as exc:
                logger.warning(f"Skipping picture without embedded image: {exc}")
                has_image = False
            if has_image:
                self._handle_pictures(shape)

        # Nothing more to do for shapes without any (non-blank) text.
        text = getattr(shape, "text", None)
        if text is None or not text.strip():
            return
        if not shape.has_text_frame:
            logger.warning("Warning: shape has text but not text_frame")
            return
        # Remaining text elements, including bulleted and numbered lists.
        self._handle_text_elements(shape)

    for slide in pptx_obj.slides:
        for shape in slide.shapes:
            visit(shape)
        # Start the page for the next slide.
        self.cur_page = []
        self.pages.append(self.cur_page)
def _handle_tables(self, shape):
    """Convert the table held by *shape* to HTML and add it to the page.

    The HTML is appended to ``self.cur_page`` as a ``BlockType.TABLE``
    block.  Cells of the first row are emitted as ``<th>``, all others as
    ``<td>``; ``rowSpan``/``gridSpan`` become ``rowspan``/``colspan`` and
    the positions they cover are not emitted again.

    Args:
        shape: Shape object; nothing happens unless ``shape.has_table``.

    Returns:
        Always ``None``; the result is delivered through ``self.cur_page``.
    """
    if not shape.has_table:
        return None

    table = shape.table
    table_xml = shape._element
    html_parts = ['<table border="1">']
    # (row, col) positions already covered by a spanning cell.
    occupied_cells = set()

    for row_idx, row in enumerate(table.rows):
        html_parts.append(" <tr>")
        for col_idx, cell in enumerate(row.cells):
            if (row_idx, col_idx) in occupied_cells:
                continue

            # The span information lives on the cell's XML element.
            matches = table_xml.xpath(
                f".//a:tbl/a:tr[{row_idx + 1}]/a:tc[{col_idx + 1}]"
            )
            if not matches:
                continue
            cell_xml = matches[0]

            row_span = int(cell_xml.get("rowSpan") or 1)
            col_span = int(cell_xml.get("gridSpan") or 1)

            # Reserve every position this cell covers except its own.
            for r in range(row_idx, row_idx + row_span):
                for c in range(col_idx, col_idx + col_span):
                    if (r, c) != (row_idx, col_idx):
                        occupied_cells.add((r, c))

            tag = "th" if row_idx == 0 else "td"

            attrs = []
            if row_span > 1:
                attrs.append(f'rowspan="{row_span}"')
            if col_span > 1:
                attrs.append(f'colspan="{col_span}"')
            attr_str = " " + " ".join(attrs) if attrs else ""

            cell_text = cell.text.strip() if cell.text else ""
            # Escape HTML special characters ("&" first, so the entities
            # produced for "<" and ">" are not escaped again).
            cell_text = (
                cell_text.replace("&", "&amp;")
                .replace("<", "&lt;")
                .replace(">", "&gt;")
            )
            html_parts.append(f" <{tag}{attr_str}>{cell_text}</{tag}>")
        html_parts.append(" </tr>")
    html_parts.append("</table>")

    self.cur_page.append(
        {
            "type": BlockType.TABLE,
            "content": "\n".join(html_parts),
        }
    )
    return None
def _handle_pictures(self, shape):
    """Append the picture of *shape* to the page as a base64 image block.

    The image bytes are opened with Pillow and converted to RGB when
    needed.  For a WMF stub image a plain grey placeholder of the same
    size is encoded instead.  Pictures whose data cannot be obtained or
    opened are logged and skipped.

    Args:
        shape: Picture shape providing ``shape.image.blob``.
    """
    try:
        image_bytes = shape.image.blob
        pil_image = Image.open(BytesIO(image_bytes))
        if isinstance(pil_image, WmfImagePlugin.WmfStubImageFile):
            logger.warning(f"Skipping WMF image, size: {pil_image.size}")
            placeholder = Image.new("RGB", pil_image.size, (240, 240, 240))
            img_base64 = image_to_b64str(placeholder)
        else:
            if pil_image.mode != "RGB":
                pil_image = pil_image.convert("RGB")
            img_base64 = image_to_b64str(pil_image)
    except (UnidentifiedImageError, OSError) as e:
        logger.warning(f"Warning: image cannot be loaded by Pillow: {e}")
        return
    except ValueError as e:
        # python-pptx raises ValueError when the picture has no embedded
        # image; treat it like any other unreadable picture.
        logger.warning(f"Skipping picture without usable image data: {e}")
        return

    self.cur_page.append(
        {
            "type": BlockType.IMAGE,
            "content": img_base64,
        }
    )
def _handle_text_elements(self, shape):
    """Convert the paragraphs of *shape* into text, title and list blocks.

    Consecutive list paragraphs are collected into one ``BlockType.LIST``
    block whose ``attribute`` ("ordered"/"unordered") is decided by the
    first item; numbered items are prefixed with "1. ", "2. ", ...
    Other paragraphs become ``BlockType.TITLE`` blocks when the shape is
    a title, centre-title or subtitle placeholder, else ``BlockType.TEXT``.

    Args:
        shape: Shape with a text frame.
    """
    # The label depends only on the shape, so work it out once.
    label = BlockType.TEXT
    if shape.is_placeholder and shape.placeholder_format.type in (
        PP_PLACEHOLDER.CENTER_TITLE,
        PP_PLACEHOLDER.TITLE,
        PP_PLACEHOLDER.SUBTITLE,
    ):
        label = BlockType.TITLE

    in_list = False
    item_number = 0

    for paragraph in shape.text_frame.paragraphs:
        is_a_list, bullet_type = self._is_list_item(paragraph)

        # Line breaks inside a paragraph are rendered as single spaces.
        p_text = "".join(
            " " if isinstance(e, CT_TextLineBreak) else e.text
            for e in paragraph._element.content_children
        )

        if not is_a_list:
            if in_list:
                # A plain paragraph ends the current list.
                in_list = False
                item_number = 0
                self.list_block_stack.pop()
            self.cur_page.append(
                {
                    "type": label,
                    "content": p_text,
                }
            )
            continue

        enumerated = bullet_type == "Numbered"
        if not in_list:
            list_block = {
                "type": BlockType.LIST,
                "attribute": "ordered" if enumerated else "unordered",
                "list_items": [],
            }
            self.cur_page.append(list_block)
            self.list_block_stack.append(list_block)
            in_list = True
            item_number = 0

        enum_marker = ""
        if enumerated:
            item_number += 1
            enum_marker = str(item_number) + ". "
        self.list_block_stack[-1]["list_items"].append(
            {
                "type": BlockType.TEXT,
                "content": enum_marker + p_text,
            }
        )

    # Fixed: a list that runs to the end of the shape used to stay on the
    # stack forever; close it so the stack does not grow shape after shape.
    if in_list:
        self.list_block_stack.pop()
    return
def _is_list_item(self, paragraph) -> tuple[bool, str]:
    """Decide whether *paragraph* should be treated as a list item.

    When the owning shape can be reached through ``paragraph._parent``,
    the effective list marker resolved for that shape decides.  Otherwise
    the paragraph's own XML is searched for bullet elements.  In both
    paths a paragraph without marker information counts as a list item
    only if its level is greater than zero.

    Args:
        paragraph: The python-pptx paragraph object to inspect.

    Returns:
        ``(is_list, bullet_type)`` where ``bullet_type`` is one of
        ``'Bullet'``, ``'Numbered'`` or ``'None'``.
    """
    p = paragraph._element

    # paragraph -> text frame -> shape, when those links exist.
    try:
        shape = paragraph._parent._parent
    except AttributeError:
        shape = None

    if shape is not None:
        info = self._get_effective_list_marker(shape, paragraph)
        kind = info["kind"]
        if info["is_list"] is True or kind in ("buChar", "buAutoNum", "buBlip"):
            names = {"buChar": "Bullet", "buAutoNum": "Numbered"}
            return (True, names.get(kind, "None"))
        if info["is_list"] is False:
            return (False, "None")
        # Undecided: an indented paragraph is taken to be a list item.
        return (paragraph.level > 0, "None")

    # No shape available: look only at the paragraph's own XML.
    ns = {"a": self.namespaces["a"]}
    if p.find(".//a:buChar", namespaces=ns) is not None:
        return (True, "Bullet")
    if p.find(".//a:buAutoNum", namespaces=ns) is not None:
        return (True, "Numbered")
    # Indentation suggests a nested list item.
    return (paragraph.level > 0, "None")
def _get_effective_list_marker(self, shape, paragraph) -> dict:
    """Resolve the effective list marker of *paragraph*.

    Sources are consulted in this order, and the first one that gives a
    definite answer wins:

    1. the paragraph's own ``a:pPr``;
    2. the list style of the shape's ``p:txBody``;
    3. for placeholders, the master text styles;
    4. for placeholders, the list style of the matching layout
       placeholder (used only when the master gave no answer).

    Layout and master are only consulted for placeholder shapes, because
    ``shape.placeholder_format`` is not available on other shapes.

    Args:
        shape: Shape object that contains the paragraph.
        paragraph: The python-pptx paragraph object to inspect.

    Returns:
        Dict with the keys ``is_list`` (True/False/None), ``kind``
        (``buChar``, ``buAutoNum``, ``buBlip``, ``buNone`` or None),
        ``detail`` (bullet character, numbering type, or None) and
        ``level`` (the paragraph level).
    """
    p = paragraph._element
    lvl = self._get_paragraph_level(p)

    def marker(is_list, kind, detail):
        return {
            "is_list": is_list,
            "kind": kind,
            "detail": detail,
            "level": lvl,
        }

    # 1) Direct paragraph properties.
    pPr = p.find("a:pPr", namespaces=self.namespaces)
    is_list, kind, detail = self._parse_bullet_from_paragraph_properties(pPr)
    if is_list is not None:
        return marker(is_list, kind, detail)

    # 2) Shape-level list style (txBody/a:lstStyle).
    txBody = shape._element.find(".//p:txBody", namespaces=self.namespaces)
    is_list, kind, detail = self._parse_bullet_from_text_body_list_style(
        txBody, lvl
    )
    if is_list is not None:
        return marker(is_list, kind, detail)

    # Layout and master styles are keyed by placeholder data, so the
    # guard below keeps non-placeholder shapes away from placeholder_format.
    if not shape.is_placeholder:
        return marker(None, None, None)

    # 3) Layout placeholder list style.
    layout_result = None
    idx = shape.placeholder_format.idx
    layout = shape.part.slide.slide_layout
    try:
        layout_ph = layout.placeholders.get(idx)
    except Exception:
        # Best effort: a failing lookup is treated as "no layout placeholder".
        layout_ph = None
    if layout_ph is not None:
        layout_tx = layout_ph._element.find(
            ".//p:txBody", namespaces=self.namespaces
        )
        is_list, kind, detail = self._parse_bullet_from_text_body_list_style(
            layout_tx, lvl
        )
        # Keep the layout answer only when it is an explicit True/False.
        if is_list is not None:
            layout_result = marker(is_list, kind, detail)

    # 4) Master text styles.
    ph_type = shape.placeholder_format.type
    master = shape.part.slide.slide_layout.slide_master
    is_list, kind, detail = self._parse_bullet_from_master_text_styles(
        master, ph_type, lvl
    )
    if kind in ("buChar", "buAutoNum", "buBlip"):
        return marker(True, kind, detail)
    if is_list is not None:
        return marker(is_list, kind, detail)

    # The master gave no answer: fall back to the layout's explicit value.
    if layout_result is not None:
        return layout_result

    return marker(None, None, None)
def _get_paragraph_level(self, paragraph) -> int:
    """Return the indentation level stored on a paragraph XML element.

    The level is read from the ``lvl`` attribute of the paragraph's
    ``a:pPr`` child.

    Args:
        paragraph: Paragraph XML element.

    Returns:
        The integer level, or 0 when there is no ``a:pPr`` child, no
        ``lvl`` attribute, or the attribute is not a valid integer.
    """
    props = paragraph.find("a:pPr", namespaces=self.namespaces)
    raw_level = None if props is None else props.attrib.get("lvl")
    if raw_level is None:
        return 0
    try:
        return int(raw_level)
    except ValueError:
        return 0
def _parse_bullet_from_paragraph_properties(
    self, pPr
) -> tuple[Optional[bool], Optional[str], Optional[str]]:
    """Read bullet or numbering information from a properties node.

    The node's direct children are probed in a fixed order: ``a:buNone``,
    ``a:buChar``, ``a:buAutoNum``, ``a:buBlip``.  The first one found
    determines the result.

    Args:
        pPr: Paragraph properties XML element, or None.

    Returns:
        ``(is_list, kind, detail)``: ``(False, "buNone", None)`` for an
        explicit "no bullet"; ``(True, kind, detail)`` for a marker, with
        ``detail`` being the ``char`` attribute, the ``type`` attribute,
        or ``"image"``; ``(None, None, None)`` when *pPr* is None or
        carries none of these children.
    """
    if pPr is None:
        return (None, None, None)

    # (element name, is_list, how to obtain the detail), highest priority first.
    probes = (
        ("buNone", False, lambda node: None),
        ("buChar", True, lambda node: node.get("char")),
        ("buAutoNum", True, lambda node: node.get("type")),
        ("buBlip", True, lambda node: "image"),
    )
    for kind, is_list, detail_of in probes:
        node = pPr.find(f"a:{kind}", namespaces=self.namespaces)
        if node is not None:
            return (is_list, kind, detail_of(node))
    return (None, None, None)
def _parse_bullet_from_text_body_list_style(
    self, txBody, lvl: int
) -> tuple[Optional[bool], Optional[str], Optional[str]]:
    """Read bullet or numbering information from a text body's list style.

    Looks up the ``a:lstStyle`` child of *txBody*, picks the properties
    node for *lvl* from it and parses that node.

    Args:
        txBody: Text body XML element, or None.
        lvl: Paragraph level.

    Returns:
        ``(is_list, kind, detail)`` as produced by
        ``_parse_bullet_from_paragraph_properties``;
        ``(None, None, None)`` when *txBody* is None.
    """
    if txBody is None:
        return (None, None, None)

    list_style = txBody.find("a:lstStyle", namespaces=self.namespaces)
    level_props = self._find_level_properties_in_list_style(list_style, lvl)
    is_list, kind, detail = self._parse_bullet_from_paragraph_properties(level_props)
    return (is_list, kind, detail)
def _parse_bullet_from_master_text_styles(
    self, slide_master, placeholder_type, lvl: int
) -> tuple[Optional[bool], Optional[str], Optional[str]]:
    """Read bullet or numbering information from the master text styles.

    The style node chosen by ``_get_master_text_style_node`` is searched
    for the ``a:lvl{lvl+1}pPr`` descendant, which is then parsed.

    Args:
        slide_master: Slide master associated with the current slide.
        placeholder_type: Placeholder type from ``PP_PLACEHOLDER``.
        lvl: Paragraph level.

    Returns:
        ``(is_list, kind, detail)`` as produced by
        ``_parse_bullet_from_paragraph_properties``;
        ``(None, None, None)`` when no style node is available.
    """
    style_node = self._get_master_text_style_node(slide_master, placeholder_type)
    if style_node is None:
        return (None, None, None)

    level_props = style_node.find(
        f".//a:lvl{lvl + 1}pPr", namespaces=self.namespaces
    )
    is_list, kind, detail = self._parse_bullet_from_paragraph_properties(level_props)
    return (is_list, kind, detail)
def _find_level_properties_in_list_style(self, lstStyle, lvl: int):
    """Find the level-specific paragraph properties node in a list style.

    Looks for a direct ``a:lvl{lvl+1}pPr`` child of *lstStyle*, so
    ``a:lvl1pPr`` corresponds to level 0, ``a:lvl2pPr`` to level 1, and
    so on.

    Args:
        lstStyle: List style XML element, or None.
        lvl: Paragraph level.

    Returns:
        The matching XML element, or None when *lstStyle* is None or has
        no such child.
    """
    if lstStyle is None:
        return None
    return lstStyle.find(f"a:lvl{lvl + 1}pPr", namespaces=self.namespaces)
def _get_master_text_style_node(
    self, slide_master, placeholder_type
) -> Optional[etree._Element]:
    """Return the master text style node that applies to a placeholder.

    ``BODY`` and ``OBJECT`` placeholders map to ``p:bodyStyle``, ``TITLE``
    maps to ``p:titleStyle``, and every other type to ``p:otherStyle``.

    Args:
        slide_master: Slide master associated with the current slide.
        placeholder_type: Placeholder type from ``PP_PLACEHOLDER``.

    Returns:
        The matching child of the master's ``p:txStyles`` element, or
        None when the master has no ``p:txStyles`` or the child is absent.
    """
    tx_styles = slide_master._element.find(
        ".//p:txStyles", namespaces=self.namespaces
    )
    if tx_styles is None:
        return None

    # NOTE(review): CENTER_TITLE falls through to p:otherStyle here although
    # it is labelled as a title elsewhere in this class - confirm intended.
    if placeholder_type in (PP_PLACEHOLDER.BODY, PP_PLACEHOLDER.OBJECT):
        style_tag = "p:bodyStyle"
    elif placeholder_type == PP_PLACEHOLDER.TITLE:
        style_tag = "p:titleStyle"
    else:
        style_tag = "p:otherStyle"
    return tx_styles.find(style_tag, namespaces=self.namespaces)

View File

@@ -0,0 +1,21 @@
from enum import Enum
from pydantic import BaseModel
class Script(str, Enum):
    """Script position of a piece of text.

    Members are also plain strings, so they compare equal to their value.
    """

    BASELINE = "baseline"  # regular text on the baseline
    SUB = "sub"
    SUPER = "super"
class Formatting(BaseModel):
    """Character formatting flags of a piece of text.

    Every flag defaults to off and the script position defaults to
    ``Script.BASELINE``.
    """

    # Emphasis flags.
    bold: bool = False
    italic: bool = False
    underline: bool = False
    strikethrough: bool = False
    # Script position of the text.
    script: Script = Script.BASELINE

View File

@@ -5,6 +5,7 @@ class BlockType:
TABLE = 'table'
IMAGE_BODY = 'image_body'
TABLE_BODY = 'table_body'
CAPTION = 'caption' # word的通用caption类型
IMAGE_CAPTION = 'image_caption'
TABLE_CAPTION = 'table_caption'
IMAGE_FOOTNOTE = 'image_footnote'
@@ -12,6 +13,7 @@ class BlockType:
TEXT = 'text'
TITLE = 'title'
INTERLINE_EQUATION = 'interline_equation'
EQUATION = "equation" # 公式(独立公式)
LIST = 'list'
INDEX = 'index'
DISCARDED = 'discarded'
@@ -129,4 +131,4 @@ class NotExtractType(Enum):
IMAGE_CAPTION = BlockType.IMAGE_CAPTION
TABLE_FOOTNOTE = BlockType.TABLE_FOOTNOTE
IMAGE_FOOTNOTE = BlockType.IMAGE_FOOTNOTE
CODE_CAPTION = BlockType.CODE_CAPTION
CODE_CAPTION = BlockType.CODE_CAPTION

View File

@@ -21,7 +21,7 @@ dependencies = [
"click>=8.1.7",
"loguru>=0.7.2",
"numpy>=1.21.6",
"pdfminer.six==20250506",
"pdfminer.six==20251230",
"tqdm>=4.67.1",
"requests",
"httpx",
@@ -37,10 +37,17 @@ dependencies = [
"fast-langdetect>=0.2.3,<0.3.0",
"scikit-image>=0.25.0,<1.0.0",
"openai>=1.70.0,<3",
"beautifulsoup4>=4.13.5,<5",
"beautifulsoup4>=4.13.5,<5",
"magika>=0.6.2,<1.1.0",
"mineru-vl-utils>=0.1.19.1,<1",
"qwen-vl-utils>=0.0.14,<1",
"python-docx>=1.2.0,<2",
'pypptx-with-oxml>=1.0.3,<2',
"mammoth>=1.11.0,<2",
"pylatexenc>=2.10,<3",
"lxml>=4.0.0,<7.0.0",
"pandas>=2.3.3,<3",
"openpyxl>=3.1.5,<4",
]
[project.optional-dependencies]