mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 19:18:34 +07:00
Compare commits
80 Commits
release-2.
...
add_docx
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3124678a20 | ||
|
|
ac2099a332 | ||
|
|
7448029a9d | ||
|
|
e92dfe5c6b | ||
|
|
8d14f64369 | ||
|
|
6aae690998 | ||
|
|
9393a62d94 | ||
|
|
25e48960c7 | ||
|
|
52874072e4 | ||
|
|
00b77122df | ||
|
|
6fc0596ce9 | ||
|
|
32c08d1418 | ||
|
|
e20523018a | ||
|
|
c05e77fcdd | ||
|
|
72f26156f0 | ||
|
|
29751a415f | ||
|
|
3743463438 | ||
|
|
6725648080 | ||
|
|
efb0421fe8 | ||
|
|
777307bad9 | ||
|
|
5ee10f0306 | ||
|
|
8611da6a4b | ||
|
|
775aafb033 | ||
|
|
78e83d00a7 | ||
|
|
7bcd3afb86 | ||
|
|
517257e058 | ||
|
|
ee4065ffd5 | ||
|
|
5026faa458 | ||
|
|
8381e61f0c | ||
|
|
a8c4b6c2fe | ||
|
|
a0b0eb704c | ||
|
|
1ed570a205 | ||
|
|
21ebf6bdb1 | ||
|
|
11513dd44c | ||
|
|
32592cd27f | ||
|
|
9137f84591 | ||
|
|
56c3bb3570 | ||
|
|
23e3a73f33 | ||
|
|
e7c67a95b6 | ||
|
|
ea6bb2ede9 | ||
|
|
810717b42a | ||
|
|
7554127ff7 | ||
|
|
d629ede38a | ||
|
|
6501ad878d | ||
|
|
6c8fa9776f | ||
|
|
1d93aa8ab9 | ||
|
|
9aba297545 | ||
|
|
dec84a9b5a | ||
|
|
cbe39f4a5a | ||
|
|
e042384953 | ||
|
|
a644a8a074 | ||
|
|
07db6839b8 | ||
|
|
17394682e2 | ||
|
|
97bd2a2b94 | ||
|
|
ad175df3d2 | ||
|
|
0cbe965d97 | ||
|
|
74f6d4d0e7 | ||
|
|
648fb1f7cf | ||
|
|
b6fc07cf9e | ||
|
|
57be6926a9 | ||
|
|
7abcfa39a0 | ||
|
|
6f76664141 | ||
|
|
23bc263b85 | ||
|
|
53fb1cd055 | ||
|
|
f0ce905c7d | ||
|
|
df33d483de | ||
|
|
f44fb174ea | ||
|
|
70b1e73606 | ||
|
|
11fb0a0199 | ||
|
|
66f8f0e93a | ||
|
|
942c1693c7 | ||
|
|
7387797b17 | ||
|
|
8a3eb268a9 | ||
|
|
fcaa34f466 | ||
|
|
f111a97d5d | ||
|
|
6d8106685e | ||
|
|
4e29979d89 | ||
|
|
569baff305 | ||
|
|
09920c6391 | ||
|
|
bc02653d63 |
119
demo/demo.py
119
demo/demo.py
@@ -6,7 +6,8 @@ from pathlib import Path
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from mineru.cli.common import convert_pdf_bytes_to_bytes_by_pypdfium2, prepare_env, read_fn
|
||||
from mineru.cli.common import convert_pdf_bytes_to_bytes_by_pypdfium2, prepare_env, read_fn, pptx_suffixes, \
|
||||
xlsx_suffixes, pdf_suffixes, image_suffixes, office_suffixes, docx_suffixes
|
||||
from mineru.data.data_reader_writer import FileBasedDataWriter
|
||||
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
|
||||
from mineru.utils.engine_utils import get_vlm_engine
|
||||
@@ -17,7 +18,9 @@ from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as
|
||||
from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
|
||||
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
|
||||
from mineru.backend.hybrid.hybrid_analyze import doc_analyze as hybrid_doc_analyze
|
||||
from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path
|
||||
from mineru.backend.office.office_middle_json_mkcontent import union_make as office_union_make
|
||||
from mineru.backend.office.docx_analyze import office_docx_analyze
|
||||
from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path, guess_suffix_by_bytes
|
||||
|
||||
|
||||
def do_parse(
|
||||
@@ -41,6 +44,24 @@ def do_parse(
|
||||
start_page_id=0, # Start page ID for parsing, default is 0
|
||||
end_page_id=None, # End page ID for parsing, default is None (parse all pages until the end of the document)
|
||||
):
|
||||
need_remove_index = _process_office_doc(
|
||||
output_dir,
|
||||
pdf_file_names=pdf_file_names,
|
||||
pdf_bytes_list=pdf_bytes_list,
|
||||
f_dump_md=f_dump_md,
|
||||
f_dump_middle_json=f_dump_middle_json,
|
||||
f_dump_model_output=f_dump_model_output,
|
||||
f_dump_orig_file=f_dump_orig_pdf,
|
||||
f_dump_content_list=f_dump_content_list,
|
||||
f_make_md_mode=f_make_md_mode,
|
||||
)
|
||||
for index in sorted(need_remove_index, reverse=True):
|
||||
del pdf_bytes_list[index]
|
||||
del pdf_file_names[index]
|
||||
del p_lang_list[index]
|
||||
if not pdf_bytes_list:
|
||||
logger.warning("No valid PDF or image files to process.")
|
||||
return
|
||||
|
||||
if backend == "pipeline":
|
||||
for idx, pdf_bytes in enumerate(pdf_bytes_list):
|
||||
@@ -68,7 +89,7 @@ def do_parse(
|
||||
pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
|
||||
md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
|
||||
f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
|
||||
f_make_md_mode, middle_json, model_json, is_pipeline=True
|
||||
f_make_md_mode, middle_json, model_json, process_mode="pipeline"
|
||||
)
|
||||
else:
|
||||
f_draw_span_bbox = False
|
||||
@@ -93,7 +114,7 @@ def do_parse(
|
||||
pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
|
||||
md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
|
||||
f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
|
||||
f_make_md_mode, middle_json, infer_result, is_pipeline=False
|
||||
f_make_md_mode, middle_json, infer_result, process_mode="vlm"
|
||||
)
|
||||
elif backend.startswith("hybrid-"):
|
||||
backend = backend[7:]
|
||||
@@ -123,9 +144,56 @@ def do_parse(
|
||||
pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
|
||||
md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
|
||||
f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
|
||||
f_make_md_mode, middle_json, infer_result, is_pipeline=False
|
||||
f_make_md_mode, middle_json, infer_result, process_mode="vlm"
|
||||
)
|
||||
|
||||
|
||||
def _process_office_doc(
        output_dir,
        pdf_file_names: list[str],
        pdf_bytes_list: list[bytes],
        f_dump_md=True,
        f_dump_middle_json=True,
        f_dump_model_output=True,
        f_dump_orig_file=True,
        f_dump_content_list=True,
        f_make_md_mode=MakeMode.MM_MD,
):
    """Handle the office documents found in the input batch.

    Each entry's suffix is guessed from its bytes. DOCX entries are analysed
    with ``office_docx_analyze`` and their outputs written through
    ``_process_output`` (bbox drawing is always disabled for them). PPTX and
    XLSX entries only produce a warning.

    Args:
        output_dir: Root directory passed to ``prepare_env``.
        pdf_file_names: Names of the input files, parallel to pdf_bytes_list.
        pdf_bytes_list: Raw bytes of the input files.
        f_dump_md, f_dump_middle_json, f_dump_model_output,
        f_dump_orig_file, f_dump_content_list, f_make_md_mode:
            Forwarded unchanged to ``_process_output``.

    Returns:
        Indices (into the input lists) of every office entry, supported or
        not, so the caller can drop them before the PDF/image backends run.
    """
    office_positions = []

    for position, raw_bytes in enumerate(pdf_bytes_list):
        doc_name = pdf_file_names[position]
        suffix = guess_suffix_by_bytes(raw_bytes)

        if suffix in docx_suffixes:
            office_positions.append(position)

            image_dir, md_dir = prepare_env(output_dir, doc_name, "office")
            image_writer = FileBasedDataWriter(image_dir)
            md_writer = FileBasedDataWriter(md_dir)
            middle_json, infer_result = office_docx_analyze(
                raw_bytes,
                image_writer=image_writer,
            )

            # Layout/span bbox drawing is switched off for office output.
            _process_output(
                middle_json["pdf_info"], raw_bytes, doc_name, md_dir, image_dir,
                md_writer, False, False, f_dump_orig_file,
                f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
                f_make_md_mode, middle_json, infer_result, process_mode="docx"
            )
            continue

        if suffix in pptx_suffixes:
            office_positions.append(position)
            logger.warning(f"Currently, PPTX files are not supported: {doc_name}")
            continue

        if suffix in xlsx_suffixes:
            office_positions.append(position)
            logger.warning(f"Currently, XLSX files are not supported: {doc_name}")

    return office_positions
|
||||
|
||||
|
||||
def _process_output(
|
||||
pdf_info,
|
||||
pdf_bytes,
|
||||
@@ -143,8 +211,18 @@ def _process_output(
|
||||
f_make_md_mode,
|
||||
middle_json,
|
||||
model_output=None,
|
||||
is_pipeline=True
|
||||
process_mode="vlm"
|
||||
):
|
||||
|
||||
if process_mode == "pipeline":
|
||||
make_func = pipeline_union_make
|
||||
elif process_mode == "vlm":
|
||||
make_func = vlm_union_make
|
||||
elif process_mode in office_suffixes:
|
||||
make_func = office_union_make
|
||||
else:
|
||||
raise Exception(f"Unknown process_mode: {process_mode}")
|
||||
|
||||
"""处理输出文件"""
|
||||
if f_draw_layout_bbox:
|
||||
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")
|
||||
@@ -153,15 +231,20 @@ def _process_output(
|
||||
draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")
|
||||
|
||||
if f_dump_orig_pdf:
|
||||
md_writer.write(
|
||||
f"{pdf_file_name}_origin.pdf",
|
||||
pdf_bytes,
|
||||
)
|
||||
if process_mode in ["pipeline", "vlm"]:
|
||||
md_writer.write(
|
||||
f"{pdf_file_name}_origin.pdf",
|
||||
pdf_bytes,
|
||||
)
|
||||
elif process_mode in office_suffixes:
|
||||
md_writer.write(
|
||||
f"{pdf_file_name}_origin.{process_mode}",
|
||||
pdf_bytes,
|
||||
)
|
||||
|
||||
image_dir = str(os.path.basename(local_image_dir))
|
||||
|
||||
if f_dump_md:
|
||||
make_func = pipeline_union_make if is_pipeline else vlm_union_make
|
||||
md_content_str = make_func(pdf_info, f_make_md_mode, image_dir)
|
||||
md_writer.write_string(
|
||||
f"{pdf_file_name}.md",
|
||||
@@ -169,13 +252,19 @@ def _process_output(
|
||||
)
|
||||
|
||||
if f_dump_content_list:
|
||||
make_func = pipeline_union_make if is_pipeline else vlm_union_make
|
||||
content_list = make_func(pdf_info, MakeMode.CONTENT_LIST, image_dir)
|
||||
md_writer.write_string(
|
||||
f"{pdf_file_name}_content_list.json",
|
||||
json.dumps(content_list, ensure_ascii=False, indent=4),
|
||||
)
|
||||
|
||||
if process_mode != "pipeline":
|
||||
content_list_v2 = make_func(pdf_info, MakeMode.CONTENT_LIST_V2, image_dir)
|
||||
md_writer.write_string(
|
||||
f"{pdf_file_name}_content_list_v2.json",
|
||||
json.dumps(content_list_v2, ensure_ascii=False, indent=4),
|
||||
)
|
||||
|
||||
if f_dump_middle_json:
|
||||
md_writer.write_string(
|
||||
f"{pdf_file_name}_middle.json",
|
||||
@@ -254,14 +343,12 @@ def parse_doc(
|
||||
if __name__ == '__main__':
|
||||
# args
|
||||
__dir__ = os.path.dirname(os.path.abspath(__file__))
|
||||
pdf_files_dir = os.path.join(__dir__, "pdfs")
|
||||
pdf_files_dir = os.path.join(__dir__, "docx")
|
||||
output_dir = os.path.join(__dir__, "output")
|
||||
pdf_suffixes = ["pdf"]
|
||||
image_suffixes = ["png", "jpeg", "jp2", "webp", "gif", "bmp", "jpg"]
|
||||
|
||||
doc_path_list = []
|
||||
for doc_path in Path(pdf_files_dir).glob('*'):
|
||||
if guess_suffix_by_path(doc_path) in pdf_suffixes + image_suffixes:
|
||||
if guess_suffix_by_path(doc_path) in pdf_suffixes + image_suffixes + office_suffixes:
|
||||
doc_path_list.append(doc_path)
|
||||
|
||||
"""如果您由于网络问题无法下载模型,可以设置环境变量MINERU_MODEL_SOURCE为modelscope使用免代理仓库下载模型"""
|
||||
|
||||
1
mineru/backend/office/__init__.py
Normal file
1
mineru/backend/office/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Copyright (c) Opendatalab. All rights reserved.
|
||||
43
mineru/backend/office/docx_analyze.py
Normal file
43
mineru/backend/office/docx_analyze.py
Normal file
@@ -0,0 +1,43 @@
|
||||
# Copyright (c) Opendatalab. All rights reserved.
|
||||
import time
|
||||
from io import BytesIO
|
||||
|
||||
from loguru import logger
|
||||
from mineru.backend.office.model_output_to_middle_json import result_to_middle_json
|
||||
|
||||
from mineru.model.docx.main import convert_binary
|
||||
|
||||
|
||||
def office_docx_analyze(
    file_bytes,
    image_writer=None
):
    """Convert DOCX bytes into a middle-json structure.

    The bytes are wrapped in a ``BytesIO`` stream and handed to
    ``convert_binary``; its result is then passed, together with
    ``image_writer``, to ``result_to_middle_json``.

    Args:
        file_bytes: Raw bytes of the document.
        image_writer: Writer forwarded to ``result_to_middle_json``; may be None.

    Returns:
        A ``(middle_json, results)`` tuple, where ``results`` is the raw
        output of ``convert_binary``.
    """
    # perf_counter() is monotonic, so the measured duration cannot go negative
    # or jump when the system clock is adjusted (time.time() can).
    infer_start = time.perf_counter()

    file_stream = BytesIO(file_bytes)
    results = convert_binary(file_stream)

    infer_time = round(time.perf_counter() - infer_start, 2)
    # Clamp so the speed computation never divides by zero on very fast runs.
    safe_time = max(infer_time, 0.01)
    logger.debug(f"infer finished, cost: {infer_time}, speed: {round(len(results) / safe_time, 3)} page/s")

    middle_json = result_to_middle_json(
        results,
        image_writer,
    )

    return middle_json, results
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke test: analyse one DOCX file and print its middle json.
    import json
    import sys

    from mineru.data.data_reader_writer import FileBasedDataWriter

    # An optional first command-line argument overrides the developer-local
    # sample path, so the script can be run on other machines.
    default_docx_path = "/Users/myhloli/projects/20240809magic_pdf/Magic-PDF/mineru/model/docx/test.docx"
    docx_path = sys.argv[1] if len(sys.argv) > 1 else default_docx_path

    with open(docx_path, 'rb') as f:
        file_bytes = f.read()

    image_writer = FileBasedDataWriter("./output_images")
    middle_json, results = office_docx_analyze(
        file_bytes,
        image_writer=image_writer,
    )

    print(json.dumps(middle_json, indent=2, ensure_ascii=False))
|
||||
48
mineru/backend/office/model_output_to_middle_json.py
Normal file
48
mineru/backend/office/model_output_to_middle_json.py
Normal file
@@ -0,0 +1,48 @@
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from mineru.backend.office.office_magic_model import MagicModel
|
||||
from mineru.version import __version__
|
||||
|
||||
|
||||
def blocks_to_page_info(page_blocks, image_writer, page_index) -> dict:
    """Convert the raw blocks of one page into a page-info dict.

    The blocks are grouped by a ``MagicModel``; every group except the
    discarded one is merged and ordered by each block's ``"index"``.

    Args:
        page_blocks: Raw block list for the page.
        image_writer: Accepted for interface compatibility; not used here.
        page_index: Value stored under ``"page_idx"``.

    Returns:
        A dict with ``"para_blocks"``, ``"discarded_blocks"`` and ``"page_idx"``.
    """
    model = MagicModel(page_blocks)

    # Concatenation order matters only for blocks sharing the same index,
    # because the sort below is stable.
    group_getters = (
        model.get_image_blocks,
        model.get_table_blocks,
        model.get_code_blocks,
        model.get_ref_text_blocks,
        model.get_phonetic_blocks,
        model.get_title_blocks,
        model.get_text_blocks,
        model.get_interline_equation_blocks,
        model.get_list_blocks,
    )
    merged = [blk for getter in group_getters for blk in getter()]
    para_blocks = sorted(merged, key=lambda blk: blk["index"])

    return {
        "para_blocks": para_blocks,
        "discarded_blocks": model.get_discarded_blocks(),
        "page_idx": page_index,
    }
|
||||
|
||||
|
||||
def result_to_middle_json(model_output_blocks_list, image_writer):
    """Build the middle-json dict from the per-page block lists.

    Args:
        model_output_blocks_list: Iterable yielding one block list per page.
        image_writer: Forwarded to ``blocks_to_page_info``.

    Returns:
        A dict with the page infos under ``"pdf_info"`` plus the backend
        name and package version.
    """
    pages = [
        blocks_to_page_info(blocks_of_page, image_writer, page_no)
        for page_no, blocks_of_page in enumerate(model_output_blocks_list)
    ]
    return {"pdf_info": pages, "_backend": "office", "_version_name": __version__}
|
||||
639
mineru/backend/office/office_magic_model.py
Normal file
639
mineru/backend/office/office_magic_model.py
Normal file
@@ -0,0 +1,639 @@
|
||||
import re
|
||||
from typing import Literal
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from mineru.utils.boxbase import calculate_overlap_area_in_bbox1_area_ratio
|
||||
from mineru.utils.enum_class import ContentType, BlockType
|
||||
from mineru.utils.magic_model_utils import reduct_overlap, tie_up_category_by_index
|
||||
|
||||
|
||||
class MagicModel:
    """Sort the raw blocks of one office-document page into per-type lists.

    The constructor walks ``page_blocks``, builds a span for each block and
    distributes the resulting blocks over the image / table / code /
    interline-equation / text / title / ref-text / phonetic / list /
    discarded lists, which are exposed through the ``get_*`` accessors.
    """

    def __init__(self, page_blocks: list):
        """Parse ``page_blocks`` and populate the per-type block lists.

        Args:
            page_blocks: List of block dicts; each is read through its
                ``"type"`` key and, for image/table/equation blocks, its
                optional ``"content"`` key.

        Raises:
            ValueError: When the span built for a block is neither a dict
                containing ``"bbox"`` nor a list.
        """
        self.page_blocks = page_blocks

        blocks = []
        self.all_spans = []

        # Classify generic caption blocks as image_caption or table_caption
        page_blocks = classify_caption_blocks(page_blocks)

        # Parse every block
        for index, block_info in enumerate(page_blocks):

            block_type = block_info["type"]
            span_type = "unknown"

            if block_type in [
                "text",
                "title",
                "image_caption",
                "table_caption",
                "header",
                "footer",
            ]:
                span_type = ContentType.TEXT
            elif block_type in ["image"]:
                block_type = BlockType.IMAGE_BODY
                span_type = ContentType.IMAGE
            elif block_type in ["table"]:
                block_type = BlockType.TABLE_BODY
                span_type = ContentType.TABLE
            elif block_type in ["equation"]:
                block_type = BlockType.INTERLINE_EQUATION
                span_type = ContentType.INTERLINE_EQUATION

            # NOTE(review): compares ContentType values against plain strings;
            # presumably the ContentType members equal "image"/"table" — confirm.
            if span_type in ["image", "table"]:
                span = {
                    "type": span_type,
                }
                if span_type == ContentType.TABLE:
                    span["html"] = clean_table_html(block_info.get("content", ""))
                elif span_type == ContentType.IMAGE:
                    # base64 in JPG format
                    span["image_base64"] = block_info.get("content", "")
            elif span_type in [ContentType.INTERLINE_EQUATION]:
                span = {
                    "type": span_type,
                    "content": block_info.get("content", ""),
                }
            else:

                # NOTE(review): block_content is read here but is not assigned
                # anywhere in this method as shown — confirm where it is set.
                if block_content:
                    block_content = clean_content(block_content)

                # Only split into inline formulas when the \( and \) delimiters
                # are balanced and at least one pair is present.
                if block_content and block_content.count("\\(") == block_content.count("\\)") and block_content.count("\\(") > 0:

                    # NOTE(review): assigned but never read in the code shown.
                    switch_code_to_algorithm = True

                    # Build the list of spans holding text and formulas
                    spans = []
                    last_end = 0

                    # Find all formulas
                    for match in re.finditer(r'\\\((.+?)\\\)', block_content):
                        start, end = match.span()

                        # Add the text preceding the formula
                        if start > last_end:
                            text_before = block_content[last_end:start]
                            if text_before.strip():
                                spans.append({
                                    # NOTE(review): block_bbox is not assigned
                                    # in the code shown — confirm.
                                    "bbox": block_bbox,
                                    "type": ContentType.TEXT,
                                    "content": text_before
                                })

                        # Add the formula (with \( and \) removed)
                        formula = match.group(1)
                        spans.append({
                            "bbox": block_bbox,
                            "type": ContentType.INLINE_EQUATION,
                            "content": formula.strip()
                        })

                        last_end = end

                    # Add the text following the last formula
                    if last_end < len(block_content):
                        text_after = block_content[last_end:]
                        if text_after.strip():
                            spans.append({
                                "bbox": block_bbox,
                                "type": ContentType.TEXT,
                                "content": text_after
                            })

                    span = spans
                else:
                    span = {
                        "bbox": block_bbox,
                        "type": span_type,
                        "content": block_content,
                    }

            # Normalise the span and record it in all_spans
            # NOTE(review): the image/table/equation spans built above carry no
            # "bbox" key, so as written they reach the ValueError branch — confirm.
            if isinstance(span, dict) and "bbox" in span:
                self.all_spans.append(span)
                spans = [span]
            elif isinstance(span, list):
                self.all_spans.extend(span)
                spans = span
            else:
                raise ValueError(f"Invalid span type: {span_type}, expected dict or list, got {type(span)}")


            # NOTE(review): `line` is not built in the code shown before this
            # use (the local `spans` is otherwise unused) — confirm.
            blocks.append(
                {
                    "bbox": block_bbox,
                    "type": block_type,
                    "lines": [line],
                    "index": index,
                }
            )

        self.image_blocks = []
        self.table_blocks = []
        self.interline_equation_blocks = []
        self.text_blocks = []
        self.title_blocks = []
        self.code_blocks = []
        self.discarded_blocks = []
        self.ref_text_blocks = []
        self.phonetic_blocks = []
        self.list_blocks = []
        # Distribute the parsed blocks over the per-type lists; blocks whose
        # type matches none of the branches are dropped.
        for block in blocks:
            if block["type"] in [BlockType.IMAGE_BODY, BlockType.IMAGE_CAPTION, BlockType.IMAGE_FOOTNOTE]:
                self.image_blocks.append(block)
            elif block["type"] in [BlockType.TABLE_BODY, BlockType.TABLE_CAPTION, BlockType.TABLE_FOOTNOTE]:
                self.table_blocks.append(block)
            elif block["type"] in [BlockType.CODE_BODY, BlockType.CODE_CAPTION]:
                self.code_blocks.append(block)
            elif block["type"] == BlockType.INTERLINE_EQUATION:
                self.interline_equation_blocks.append(block)
            elif block["type"] == BlockType.TEXT:
                self.text_blocks.append(block)
            elif block["type"] == BlockType.TITLE:
                self.title_blocks.append(block)
            elif block["type"] in [BlockType.REF_TEXT]:
                self.ref_text_blocks.append(block)
            elif block["type"] in [BlockType.PHONETIC]:
                self.phonetic_blocks.append(block)
            elif block["type"] in [BlockType.HEADER, BlockType.FOOTER, BlockType.PAGE_NUMBER, BlockType.ASIDE_TEXT, BlockType.PAGE_FOOTNOTE]:
                self.discarded_blocks.append(block)
            elif block["type"] == BlockType.LIST:
                self.list_blocks.append(block)
            else:
                continue

        # Nest text/ref-text blocks into list blocks, then wrap image, table
        # and code bodies with their captions/footnotes as two-layer blocks.
        self.list_blocks, self.text_blocks, self.ref_text_blocks = fix_list_blocks(self.list_blocks, self.text_blocks, self.ref_text_blocks)
        self.image_blocks, not_include_image_blocks = fix_two_layer_blocks(self.image_blocks, BlockType.IMAGE)
        self.table_blocks, not_include_table_blocks = fix_two_layer_blocks(self.table_blocks, BlockType.TABLE)
        self.code_blocks, not_include_code_blocks = fix_two_layer_blocks(self.code_blocks, BlockType.CODE)
        # Lift the code sub-type / guessed language from the first line's
        # "extra" payload onto the enclosing code block, then drop the payload.
        for code_block in self.code_blocks:
            for block in code_block['blocks']:
                if block['type'] == BlockType.CODE_BODY:
                    if len(block["lines"]) > 0:
                        line = block["lines"][0]
                        code_block["sub_type"] = line["extra"]["type"]
                        if code_block["sub_type"] in ["code"]:
                            code_block["guess_lang"] = line["extra"]["guess_lang"]
                        del line["extra"]
                    else:
                        code_block["sub_type"] = "code"
                        code_block["guess_lang"] = "txt"

        # Blocks that could not be attached to any body become plain text blocks
        for block in not_include_image_blocks + not_include_table_blocks + not_include_code_blocks:
            block["type"] = BlockType.TEXT
            self.text_blocks.append(block)


    def get_list_blocks(self):
        """Return the list blocks."""
        return self.list_blocks

    def get_image_blocks(self):
        """Return the image blocks."""
        return self.image_blocks

    def get_table_blocks(self):
        """Return the table blocks."""
        return self.table_blocks

    def get_code_blocks(self):
        """Return the code blocks."""
        return self.code_blocks

    def get_ref_text_blocks(self):
        """Return the reference-text blocks."""
        return self.ref_text_blocks

    def get_phonetic_blocks(self):
        """Return the phonetic blocks."""
        return self.phonetic_blocks

    def get_title_blocks(self):
        """Return the title blocks."""
        return self.title_blocks

    def get_text_blocks(self):
        """Return the text blocks."""
        return self.text_blocks

    def get_interline_equation_blocks(self):
        """Return the interline-equation blocks."""
        return self.interline_equation_blocks

    def get_discarded_blocks(self):
        """Return the discarded blocks (header, footer, page number, ...)."""
        return self.discarded_blocks

    def get_all_spans(self):
        """Return every span collected while parsing."""
        return self.all_spans
|
||||
|
||||
|
||||
def clean_table_html(html: str) -> str:
    """Clean table HTML, keeping only what describes the table structure.

    Every opening (or self-closing) tag is rebuilt with a lower-cased tag
    name and only its ``colspan`` / ``rowspan`` attributes. All other
    attributes (style, class, border, ...) are removed. Closing tags and
    text content are left untouched.

    Args:
        html: Raw table HTML string.

    Returns:
        The cleaned HTML string, or ``""`` when ``html`` is empty or None.
    """
    if not html:
        return ""

    # Attributes that carry table structure and must survive the cleaning.
    preserved_attrs = {'colspan', 'rowspan'}

    # name = "value" | 'value' | bare-value.
    # - The name may contain '-' / ':' and must not start mid-token, so that
    #   e.g. data-colspan="3" is not mistaken for colspan="3".
    # - A bare value stops at whitespace, quotes, '=', '<', '>' and '`', so it
    #   can no longer swallow the tag's closing '>'.
    attr_pattern = re.compile(
        r'(?<![\w:-])([\w:-]+)\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s"\'=<>`]+))'
    )

    def clean_tag(match):
        """Rebuild one tag with only the structural attributes kept."""
        full_tag = match.group(0)
        raw_name = match.group(1)
        is_self_closing = full_tag.rstrip().endswith('/>')

        # Scan only the text after "<tagname" so the tag name itself is never
        # considered as an attribute.
        attr_text = full_tag[1 + len(raw_name):]

        kept_attrs = []
        for attr_match in attr_pattern.finditer(attr_text):
            attr_name = attr_match.group(1).lower()
            if attr_name not in preserved_attrs:
                continue
            double_quoted, single_quoted, bare = attr_match.group(2, 3, 4)
            if bare and is_self_closing:
                # In "<td colspan=2/>" the slash belongs to the tag terminator.
                bare = bare.rstrip('/')
            attr_value = double_quoted or single_quoted or bare or ""
            kept_attrs.append(f'{attr_name}="{attr_value}"')

        attrs_str = ' ' + ' '.join(kept_attrs) if kept_attrs else ''
        terminator = '/>' if is_self_closing else '>'
        return f'<{raw_name.lower()}{attrs_str}{terminator}'

    # Matches opening tags (including self-closing ones) and captures the tag
    # name: <tagname ...> or <tagname .../>
    tag_pattern = r'<(\w+)(?:\s+[^>]*)?\s*/?>'

    return re.sub(tag_pattern, clean_tag, html)
|
||||
|
||||
|
||||
def isolated_formula_clean(txt):
    r"""Strip the ``\[`` / ``\]`` display-math delimiters from a LaTeX string.

    Surrounding whitespace is removed before the delimiters are checked, so
    input such as ``"\[x\]\n"`` is unwrapped as well.

    Args:
        txt: LaTeX source, optionally wrapped in ``\[ ... \]``.

    Returns:
        The formula body with delimiters and outer whitespace removed.
    """
    latex = txt.strip()
    if latex.startswith("\\["):
        latex = latex[2:]
    if latex.endswith("\\]"):
        latex = latex[:-2]
    return latex.strip()
|
||||
|
||||
|
||||
def code_content_clean(content):
    """Remove the Markdown code-fence lines that wrap a code snippet.

    A first line starting with three backticks and a last line consisting
    only of three backticks (ignoring surrounding whitespace) are dropped;
    the remaining lines are joined with newlines and stripped.

    Args:
        content: Code text, possibly fenced; may be empty or None.

    Returns:
        The code without its fences, or ``""`` when nothing is left.
    """
    if not content:
        return ""

    body = content.splitlines()

    # Drop the opening fence (it may carry a language tag).
    if body and body[0].startswith("```"):
        body = body[1:]

    # Drop the closing fence.
    if body and body[-1].strip() == "```":
        body = body[:-1]

    return "\n".join(body).strip()
|
||||
|
||||
|
||||
def clean_content(content):
    r"""Rewrite every ``\[x\]`` in the text as ``[x]``.

    The rewrite only happens when the text holds at least one ``\[`` and
    the numbers of ``\[`` and ``\]`` are equal; otherwise the input is
    returned unchanged.

    Args:
        content: Text to clean; may be empty or None.

    Returns:
        The cleaned text, or ``content`` itself when no rewrite applies.
    """
    if not content:
        return content

    opening_count = content.count("\\[")
    if opening_count == 0 or opening_count != content.count("\\]"):
        return content

    # Keep what sits between the escaped brackets, dropping the backslashes.
    return re.sub(r'\\\[(.*?)\\\]', lambda found: f"[{found.group(1)}]", content)
|
||||
|
||||
|
||||
def __tie_up_category_by_index(blocks, subject_block_type, object_block_type):
    """Index-based subject/object association wrapper.

    Builds two lazy providers, one for the blocks of ``subject_block_type``
    and one for those of ``object_block_type``, and passes them to the
    shared ``tie_up_category_by_index`` routine.

    Args:
        blocks: Block dicts carrying "type", "bbox", "lines", "index" and "angle".
        subject_block_type: Type of the subject blocks (e.g. a body type).
        object_block_type: Type of the object blocks (e.g. a caption type).

    Returns:
        Whatever ``tie_up_category_by_index`` returns for the two providers.
    """
    def _select(wanted_type):
        # Project the matching blocks onto the four fields the generic
        # routine works with, then de-duplicate via reduct_overlap.
        projected = [
            {"bbox": blk["bbox"], "lines": blk["lines"], "index": blk["index"], "angle": blk["angle"]}
            for blk in blocks
            if blk["type"] == wanted_type
        ]
        return reduct_overlap(projected)

    def get_subjects():
        return _select(subject_block_type)

    def get_objects():
        return _select(object_block_type)

    # Delegate to the generic implementation
    return tie_up_category_by_index(
        get_subjects,
        get_objects
    )
|
||||
|
||||
|
||||
def get_type_blocks(blocks, block_type: Literal["image", "table", "code"]):
    """Pair every ``<block_type>`` body with its captions and footnotes.

    Args:
        blocks: Candidate block dicts.
        block_type: Family of blocks to group.

    Returns:
        A list of records, each holding ``<block_type>_body``,
        ``<block_type>_caption_list`` and ``<block_type>_footnote_list``.

    Raises:
        StopIteration: When a body found in the caption association has no
            entry with the same ``sub_idx`` in the footnote association.
    """
    body_type = f"{block_type}_body"
    caption_links = __tie_up_category_by_index(blocks, body_type, f"{block_type}_caption")
    footnote_links = __tie_up_category_by_index(blocks, body_type, f"{block_type}_footnote")

    records = []
    for caption_link in caption_links:
        record = {
            f"{block_type}_body": caption_link["sub_bbox"],
            f"{block_type}_caption_list": caption_link["obj_bboxes"],
        }
        wanted_idx = caption_link["sub_idx"]
        # The footnote association is looked up by the same subject index.
        footnote_link = next(link for link in footnote_links if link["sub_idx"] == wanted_idx)
        record[f"{block_type}_footnote_list"] = footnote_link["obj_bboxes"]
        records.append(record)
    return records
|
||||
|
||||
|
||||
def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table", "code"]):
    """Wrap bodies with their captions/footnotes into two-layer blocks.

    The records from ``get_type_blocks`` (body, caption list, footnote list)
    are cleaned up for table and image types, then each record is turned
    into one block of type ``fix_type`` whose ``"blocks"`` hold the body,
    captions and footnotes sorted by ``"index"``.

    Args:
        blocks: Input block dicts of one family; the ``"type"`` key is
            removed from every one of them before returning.
        fix_type: Family being processed.

    Returns:
        ``(fixed_blocks, not_include_blocks)``: the two-layer blocks, and the
        captions/footnotes/input blocks that were not attached to any body.
    """
    # NOTE(review): the nesting below was reconstructed from a source view
    # without indentation; in particular confirm that "Step 4" belongs under
    # the table/image branch.
    need_fix_blocks = get_type_blocks(blocks, fix_type)
    fixed_blocks = []
    not_include_blocks = []
    processed_indices = set()

    # Special handling for table types: make sure captions come before the
    # table and footnotes come after it
    if fix_type in ["table", "image"]:
        # Collect all misplaced captions and footnotes
        misplaced_captions = []  # holds (caption, original block index); never filled in the code shown
        misplaced_footnotes = []  # holds (footnote, original block index)

        # Step 1: remove footnotes that violate the position requirement
        for block_idx, block in enumerate(need_fix_blocks):
            body = block[f"{fix_type}_body"]
            body_index = body["index"]

            # A footnote must sit after the body or at the same position
            valid_footnotes = []
            for footnote in block[f"{fix_type}_footnote_list"]:
                if footnote["index"] >= body_index:
                    valid_footnotes.append(footnote)
                else:
                    misplaced_footnotes.append((footnote, block_idx))
            block[f"{fix_type}_footnote_list"] = valid_footnotes

        # Step 3: reassign the misplaced footnotes to a suitable body
        for footnote, original_block_idx in misplaced_footnotes:
            footnote_index = footnote["index"]
            best_block_idx = None
            min_distance = float('inf')

            # Look for the nearest body whose index is <= footnote_index
            for idx, block in enumerate(need_fix_blocks):
                body_index = block[f"{fix_type}_body"]["index"]
                if body_index <= footnote_index and idx != original_block_idx:
                    distance = footnote_index - body_index
                    if distance < min_distance:
                        min_distance = distance
                        best_block_idx = idx

            if best_block_idx is not None:
                # Suitable body found: add to that block's footnote list
                need_fix_blocks[best_block_idx][f"{fix_type}_footnote_list"].append(footnote)
            else:
                # No suitable body: treat as an ordinary block
                not_include_blocks.append(footnote)

        # Step 4: pull captions/footnotes whose indices are not contiguous
        # out of each block's lists and treat them as ordinary blocks
        for block in need_fix_blocks:
            caption_list = block[f"{fix_type}_caption_list"]
            footnote_list = block[f"{fix_type}_footnote_list"]
            body_index = block[f"{fix_type}_body"]["index"]

            # Captions: walk backwards from the body (captions precede the body)
            if caption_list:
                # Sort by index descending so the check starts nearest the body
                caption_list.sort(key=lambda x: x["index"], reverse=True)
                filtered_captions = [caption_list[0]]
                for i in range(1, len(caption_list)):
                    prev_index = caption_list[i - 1]["index"]
                    curr_index = caption_list[i]["index"]

                    # Check whether the indices are contiguous
                    if curr_index == prev_index - 1:
                        filtered_captions.append(caption_list[i])
                    else:
                        # Check whether the gap contains only body_index
                        gap_indices = set(range(curr_index + 1, prev_index))
                        if gap_indices == {body_index}:
                            # Only the body sits in the gap: not a real gap
                            filtered_captions.append(caption_list[i])
                        else:
                            # A real gap: all remaining captions become ordinary blocks
                            not_include_blocks.extend(caption_list[i:])
                            break
                # Restore ascending order
                filtered_captions.reverse()
                block[f"{fix_type}_caption_list"] = filtered_captions

            # Footnotes: walk forwards from the body (footnotes follow the body)
            if footnote_list:
                # Sort by index ascending so the check starts nearest the body
                footnote_list.sort(key=lambda x: x["index"])
                filtered_footnotes = [footnote_list[0]]
                for i in range(1, len(footnote_list)):
                    # Check contiguity with the previous footnote
                    if footnote_list[i]["index"] == footnote_list[i - 1]["index"] + 1:
                        filtered_footnotes.append(footnote_list[i])
                    else:
                        # A gap: all remaining footnotes become ordinary blocks
                        not_include_blocks.extend(footnote_list[i:])
                        break
                block[f"{fix_type}_footnote_list"] = filtered_footnotes

    # Build the two-layer blocks
    for block in need_fix_blocks:
        body = block[f"{fix_type}_body"]
        caption_list = block[f"{fix_type}_caption_list"]
        footnote_list = block[f"{fix_type}_footnote_list"]

        body["type"] = f"{fix_type}_body"
        for caption in caption_list:
            caption["type"] = f"{fix_type}_caption"
            processed_indices.add(caption["index"])
        for footnote in footnote_list:
            footnote["type"] = f"{fix_type}_footnote"
            processed_indices.add(footnote["index"])

        processed_indices.add(body["index"])

        two_layer_block = {
            "type": fix_type,
            "bbox": body["bbox"],
            "blocks": [body],
            "index": body["index"],
        }
        two_layer_block["blocks"].extend([*caption_list, *footnote_list])
        # Sort the inner blocks by index
        two_layer_block["blocks"].sort(key=lambda x: x["index"])

        fixed_blocks.append(two_layer_block)

    # Add the blocks that were not processed
    for block in blocks:
        # The original "type" is dropped from every input block here.
        block.pop("type", None)
        if block["index"] not in processed_indices and block not in not_include_blocks:
            not_include_blocks.append(block)

    return fixed_blocks, not_include_blocks
|
||||
|
||||
|
||||
def fix_list_blocks(list_blocks, text_blocks, ref_text_blocks):
    """Nest text and ref-text blocks inside the list blocks that cover them.

    A text/ref-text block is moved into the first list block for which
    ``calculate_overlap_area_in_bbox1_area_ratio(block bbox, list bbox)`` is
    at least 0.8. List blocks that end up empty are dropped; the others get
    a ``"sub_type"`` equal to the most frequent type of their children.

    Args:
        list_blocks: List block dicts; ``"lines"`` is removed from each and
            ``"blocks"`` is (re)initialised.
        text_blocks: Text block dicts; absorbed ones are removed in place.
        ref_text_blocks: Ref-text block dicts; absorbed ones are removed in place.

    Returns:
        ``(list_blocks, text_blocks, ref_text_blocks)`` after the regrouping.
    """
    for container in list_blocks:
        container.pop("lines", None)
        container["blocks"] = []

    absorbed = []
    for candidate in text_blocks + ref_text_blocks:
        # The first sufficiently overlapping list block wins.
        owner = next(
            (
                container
                for container in list_blocks
                if calculate_overlap_area_in_bbox1_area_ratio(candidate["bbox"], container["bbox"]) >= 0.8
            ),
            None,
        )
        if owner is not None:
            owner["blocks"].append(candidate)
            absorbed.append(candidate)

    for candidate in absorbed:
        if candidate in text_blocks:
            text_blocks.remove(candidate)
        elif candidate in ref_text_blocks:
            ref_text_blocks.remove(candidate)

    # Drop list blocks that received no children
    populated = [container for container in list_blocks if container["blocks"]]

    for container in populated:
        # The most frequent child type becomes the list block's sub_type
        tally = {}
        for child in container["blocks"]:
            child_type = child["type"]
            tally[child_type] = tally.get(child_type, 0) + 1
        container["sub_type"] = max(tally, key=tally.get) if tally else "unknown"

    return populated, text_blocks, ref_text_blocks
|
||||
|
||||
|
||||
def classify_caption_blocks(page_blocks: list) -> list:
    """Resolve generic ``caption`` blocks into ``table_caption`` / ``image_caption``.

    Rules:
    1. Only a caption adjacent to a ``table`` or ``image`` block is kept as
       a caption.
    2. A caption still counts as adjacent when every block between it and
       the table/image is itself a caption.
    3. The caption takes the type of the adjacent parent that precedes it;
       if there is none, the adjacent parent that follows it is used.
    4. A caption without any adjacent parent is retyped to ``text``.

    Non-caption blocks are passed through as the same objects; caption
    blocks are shallow-copied before their ``type`` is rewritten, so the
    input dicts are not modified.
    """
    if not page_blocks:
        return page_blocks

    parent_types = ("table", "image")
    total = len(page_blocks)

    def adjacent_parent(start, step):
        # Walk from `start` in direction `step`, skipping over captions,
        # until a table/image is found or any other block ends the search.
        pos = start
        while 0 <= pos < total:
            kind = page_blocks[pos].get("type")
            if kind in parent_types:
                return kind
            if kind != "caption":
                return None
            pos += step
        return None

    classified = []
    for idx, block in enumerate(page_blocks):
        if block.get("type") != "caption":
            classified.append(block)
            continue

        # The preceding parent has priority over the following one.
        parent = adjacent_parent(idx - 1, -1) or adjacent_parent(idx + 1, 1)

        relabelled = block.copy()
        relabelled["type"] = f"{parent}_caption" if parent else "text"
        classified.append(relabelled)

    return classified
|
||||
658
mineru/backend/office/office_middle_json_mkcontent.py
Normal file
658
mineru/backend/office/office_middle_json_mkcontent.py
Normal file
@@ -0,0 +1,658 @@
|
||||
import os
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from mineru.utils.char_utils import full_to_half_exclude_marks, is_hyphen_at_line_end
|
||||
from mineru.utils.config_reader import get_latex_delimiter_config, get_formula_enable, get_table_enable
|
||||
from mineru.utils.enum_class import MakeMode, BlockType, ContentType, ContentTypeV2
|
||||
from mineru.utils.language import detect_lang
|
||||
|
||||
# LaTeX delimiters as returned by the configuration reader.
latex_delimiters_config = get_latex_delimiter_config()

# Fallback delimiters used when the configuration yields a falsy value.
default_delimiters = {
    'display': {'left': '$$', 'right': '$$'},
    'inline': {'left': '$', 'right': '$'},
}

delimiters = latex_delimiters_config or default_delimiters

# Left/right delimiter strings for display (block) and inline formulas.
display_left_delimiter, display_right_delimiter = (
    delimiters['display'][side] for side in ('left', 'right')
)
inline_left_delimiter, inline_right_delimiter = (
    delimiters['inline'][side] for side in ('left', 'right')
)
|
||||
|
||||
|
||||
def merge_para_with_text(para_block, formula_enable=True, img_buket_path=''):
    """Flatten the spans of a block into one string.

    Args:
        para_block: dict with a ``lines`` list; each line has a ``spans``
            list whose items carry ``type`` and ``content`` (and optionally
            ``image_path``). The ``content`` of text spans is rewritten in
            place with the result of ``full_to_half_exclude_marks``.
        formula_enable: when true, display equations are emitted as LaTeX
            wrapped in the display delimiters; otherwise they are emitted
            as a Markdown image reference if the span has an ``image_path``.
        img_buket_path: directory prefix placed in front of ``image_path``
            in image references.

    Returns:
        str: the merged text. Spans are separated by a space, except that
        in CJK text the last span of a line is appended without a space
        (unless it is an inline equation), and in non-CJK text a line-end
        hyphen suppresses the space and is removed when the next line
        starts with a lowercase text span.
    """
    lines = para_block['lines']

    # Pass 1: normalise text spans and collect their text for language detection.
    block_text = ''
    for line in lines:
        for span in line['spans']:
            if span['type'] == ContentType.TEXT:
                span['content'] = full_to_half_exclude_marks(span['content'])
                block_text += span['content']
    block_lang = detect_lang(block_text)
    # Chinese / Japanese / Korean: line breaks need no separating space.
    is_cjk = block_lang in {'zh', 'ja', 'ko'}

    def next_line_starts_lowercase(line_idx):
        # True when the line after `line_idx` begins with a text span whose
        # first character is lowercase (i.e. a hyphenated word continues).
        if line_idx + 1 >= len(lines):
            return False
        next_spans = lines[line_idx + 1].get('spans')
        if not next_spans:
            return False
        first = next_spans[0]
        if first.get('type') != ContentType.TEXT:
            return False
        text = first.get('content', '')
        return bool(text) and text[0].islower()

    # Pass 2: concatenate span contents with language-aware spacing.
    para_text = ''
    for i, line in enumerate(lines):
        spans = line['spans']
        for j, span in enumerate(spans):
            span_type = span['type']
            content = ''
            if span_type == ContentType.TEXT:
                content = span['content']
            elif span_type == ContentType.INLINE_EQUATION:
                content = f"{inline_left_delimiter}{span['content']}{inline_right_delimiter}"
            elif span_type == ContentType.INTERLINE_EQUATION:
                if formula_enable:
                    content = f"\n{display_left_delimiter}\n{span['content']}\n{display_right_delimiter}\n"
                elif span.get('image_path', ''):
                    # Formula output disabled: fall back to the formula image.
                    # (Previously an empty string was produced here, so the
                    # image and img_buket_path were silently ignored.)
                    content = f"![]({img_buket_path}/{span['image_path']})"

            content = content.strip()
            if not content:
                continue

            if span_type == ContentType.INTERLINE_EQUATION:
                # Display equations / their images are appended verbatim.
                para_text += content
                continue

            is_last_span = j == len(spans) - 1

            if is_cjk:
                # No space at a line break, except after an inline equation.
                if is_last_span and span_type != ContentType.INLINE_EQUATION:
                    para_text += content
                else:
                    para_text += f'{content} '
            elif span_type in [ContentType.TEXT, ContentType.INLINE_EQUATION]:
                if (
                    is_last_span
                    and span_type == ContentType.TEXT
                    and is_hyphen_at_line_end(content)
                ):
                    if next_line_starts_lowercase(i):
                        # Word continues on the next line: drop the hyphen.
                        para_text += content[:-1]
                    else:
                        # Keep the hyphen but do not add a space.
                        para_text += content
                else:
                    # Western text: spans are separated by a space.
                    para_text += f'{content} '
    return para_text
|
||||
|
||||
|
||||
def mk_blocks_to_markdown(para_blocks, make_mode, formula_enable, table_enable, img_buket_path=''):
    """Render the blocks of one page as a list of Markdown fragments.

    Args:
        para_blocks: iterable of block dicts, each with a ``type``.
        make_mode: ``MakeMode.MM_MD`` renders images and tables;
            ``MakeMode.NLP_MD`` skips image and table blocks entirely.
        formula_enable: forwarded to ``merge_para_with_text`` for text-like
            and list blocks.
        table_enable: when true a table span's ``html`` is preferred over
            its image; otherwise only the image is emitted.
        img_buket_path: directory prefix used in Markdown image references.

    Returns:
        list[str]: one stripped Markdown string per block that produced
        non-blank output, in input order.
    """

    def image_link(span):
        # Markdown image reference for a span, or '' when it has no path.
        # (Previously an empty string was always produced here, which
        # dropped every image from the output.)
        if span.get('image_path', ''):
            return f"![]({img_buket_path}/{span['image_path']})"
        return ''

    def body_spans(children, body_type, span_type):
        # Yield spans of `span_type` inside child blocks of `body_type`.
        for child in children:
            if child['type'] != body_type:
                continue
            for line in child['lines']:
                for span in line['spans']:
                    if span['type'] == span_type:
                        yield span

    def child_texts(children, child_type):
        # Merged text of every child block of `child_type`, in order.
        return [merge_para_with_text(child) for child in children if child['type'] == child_type]

    page_markdown = []
    for para_block in para_blocks:
        para_text = ''
        para_type = para_block['type']
        if para_type in [BlockType.TEXT, BlockType.INTERLINE_EQUATION, BlockType.PHONETIC, BlockType.REF_TEXT]:
            para_text = merge_para_with_text(para_block, formula_enable=formula_enable, img_buket_path=img_buket_path)
        elif para_type == BlockType.LIST:
            for block in para_block['blocks']:
                item_text = merge_para_with_text(block, formula_enable=formula_enable, img_buket_path=img_buket_path)
                para_text += f"{item_text} \n"
        elif para_type == BlockType.TITLE:
            title_level = get_title_level(para_block)
            para_text = f'{"#" * title_level} {merge_para_with_text(para_block)}'
        elif para_type == BlockType.IMAGE:
            if make_mode == MakeMode.NLP_MD:
                continue
            elif make_mode == MakeMode.MM_MD:
                children = para_block['blocks']
                has_image_footnote = any(child['type'] == BlockType.IMAGE_FOOTNOTE for child in children)
                if has_image_footnote:
                    # With a footnote: caption, then image, then footnote.
                    para_text += ''.join(text + ' \n' for text in child_texts(children, BlockType.IMAGE_CAPTION))
                    para_text += ''.join(
                        image_link(span) for span in body_spans(children, BlockType.IMAGE_BODY, ContentType.IMAGE)
                    )
                    para_text += ''.join(' \n' + text for text in child_texts(children, BlockType.IMAGE_FOOTNOTE))
                else:
                    # Without a footnote: image first, caption underneath.
                    para_text += ''.join(
                        image_link(span) for span in body_spans(children, BlockType.IMAGE_BODY, ContentType.IMAGE)
                    )
                    para_text += ''.join(' \n' + text for text in child_texts(children, BlockType.IMAGE_CAPTION))
        elif para_type == BlockType.TABLE:
            if make_mode == MakeMode.NLP_MD:
                continue
            elif make_mode == MakeMode.MM_MD:
                children = para_block['blocks']
                # Order: caption, table body, footnote.
                para_text += ''.join(text + ' \n' for text in child_texts(children, BlockType.TABLE_CAPTION))
                for span in body_spans(children, BlockType.TABLE_BODY, ContentType.TABLE):
                    if table_enable and span.get('html', ''):
                        para_text += f"\n{span['html']}\n"
                    else:
                        # No usable HTML (or table output disabled): use the image.
                        para_text += image_link(span)
                para_text += ''.join('\n' + text + ' ' for text in child_texts(children, BlockType.TABLE_FOOTNOTE))
        elif para_type == BlockType.CODE:
            sub_type = para_block["sub_type"]
            children = para_block['blocks']
            # Order: caption, then code body.
            para_text += ''.join(text + ' \n' for text in child_texts(children, BlockType.CODE_CAPTION))
            for block in children:
                if block['type'] == BlockType.CODE_BODY:
                    if sub_type == BlockType.CODE:
                        # Real code goes into a fenced block tagged with the guessed language.
                        guess_lang = para_block["guess_lang"]
                        para_text += f"```{guess_lang}\n{merge_para_with_text(block)}\n```"
                    elif sub_type == BlockType.ALGORITHM:
                        para_text += merge_para_with_text(block)

        stripped = para_text.strip()
        if stripped:
            page_markdown.append(stripped)

    return page_markdown
|
||||
|
||||
|
||||
def make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size):
    """Convert one block into a flat ``content_list`` entry.

    Args:
        para_block: block dict with a ``type`` and, depending on the type,
            ``lines`` or nested ``blocks``; may carry a ``bbox``.
        img_buket_path: directory prefix for image paths in the output.
        page_idx: page index stored under ``page_idx`` in the result.
        page_size: ``(width, height)`` of the page; only read when the
            block has a ``bbox``.

    Returns:
        dict: the content entry. Block types not handled here yield a dict
        that contains only ``page_idx`` (plus ``bbox`` when available).
        ``bbox`` is the block bbox scaled to integers on a 0-1000 grid
        relative to the page size.
    """
    para_type = para_block['type']
    para_content = {}
    if para_type in [
        BlockType.TEXT,
        BlockType.REF_TEXT,
        BlockType.PHONETIC,
        BlockType.HEADER,
        BlockType.FOOTER,
        BlockType.PAGE_NUMBER,
        BlockType.ASIDE_TEXT,
        BlockType.PAGE_FOOTNOTE,
    ]:
        para_content = {
            'type': para_type,
            'text': merge_para_with_text(para_block),
        }
    elif para_type == BlockType.LIST:
        para_content = {
            'type': para_type,
            'sub_type': para_block.get('sub_type', ''),
            'list_items': [],
        }
        for block in para_block['blocks']:
            item_text = merge_para_with_text(block)
            # Blank items are not reported.
            if item_text.strip():
                para_content['list_items'].append(item_text)
    elif para_type == BlockType.TITLE:
        title_level = get_title_level(para_block)
        para_content = {
            'type': ContentType.TEXT,
            'text': merge_para_with_text(para_block),
        }
        # Level 0 means "not a heading": emitted as plain text without a level.
        if title_level != 0:
            para_content['text_level'] = title_level
    elif para_type == BlockType.INTERLINE_EQUATION:
        para_content = {
            'type': ContentType.EQUATION,
            'text': merge_para_with_text(para_block),
            'text_format': 'latex',
        }
    elif para_type == BlockType.IMAGE:
        para_content = {'type': ContentType.IMAGE, 'img_path': '', BlockType.IMAGE_CAPTION: [], BlockType.IMAGE_FOOTNOTE: []}
        for block in para_block['blocks']:
            if block['type'] == BlockType.IMAGE_BODY:
                for line in block['lines']:
                    for span in line['spans']:
                        if span['type'] == ContentType.IMAGE:
                            if span.get('image_path', ''):
                                para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"
            if block['type'] == BlockType.IMAGE_CAPTION:
                para_content[BlockType.IMAGE_CAPTION].append(merge_para_with_text(block))
            if block['type'] == BlockType.IMAGE_FOOTNOTE:
                para_content[BlockType.IMAGE_FOOTNOTE].append(merge_para_with_text(block))
    elif para_type == BlockType.TABLE:
        para_content = {'type': ContentType.TABLE, 'img_path': '', BlockType.TABLE_CAPTION: [], BlockType.TABLE_FOOTNOTE: []}
        for block in para_block['blocks']:
            if block['type'] == BlockType.TABLE_BODY:
                for line in block['lines']:
                    for span in line['spans']:
                        if span['type'] == ContentType.TABLE:
                            # HTML and image are independent; both are kept when present.
                            if span.get('html', ''):
                                para_content[BlockType.TABLE_BODY] = f"{span['html']}"
                            if span.get('image_path', ''):
                                para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"
            if block['type'] == BlockType.TABLE_CAPTION:
                para_content[BlockType.TABLE_CAPTION].append(merge_para_with_text(block))
            if block['type'] == BlockType.TABLE_FOOTNOTE:
                para_content[BlockType.TABLE_FOOTNOTE].append(merge_para_with_text(block))
    elif para_type == BlockType.CODE:
        para_content = {'type': BlockType.CODE, 'sub_type': para_block["sub_type"], BlockType.CODE_CAPTION: []}
        for block in para_block['blocks']:
            if block['type'] == BlockType.CODE_BODY:
                para_content[BlockType.CODE_BODY] = merge_para_with_text(block)
                if para_block["sub_type"] == BlockType.CODE:
                    para_content["guess_lang"] = para_block["guess_lang"]
            if block['type'] == BlockType.CODE_CAPTION:
                para_content[BlockType.CODE_CAPTION].append(merge_para_with_text(block))

    para_bbox = para_block.get('bbox')
    if para_bbox:
        # page_size is only unpacked when a bbox has to be scaled, so blocks
        # without a bbox no longer fail when the page carries no page_size.
        page_width, page_height = page_size
        x0, y0, x1, y1 = para_bbox
        para_content['bbox'] = [
            int(x0 * 1000 / page_width),
            int(y0 * 1000 / page_height),
            int(x1 * 1000 / page_width),
            int(y1 * 1000 / page_height),
        ]

    para_content['page_idx'] = page_idx

    return para_content
|
||||
|
||||
|
||||
def make_blocks_to_content_list_v2(para_block, img_buket_path, page_size):
    """Convert one block into a ``content_list_v2`` entry.

    Args:
        para_block: block dict with a ``type`` and, depending on the type,
            ``lines`` or nested ``blocks``; may carry a ``bbox``.
        img_buket_path: directory prefix for image paths in the output.
        page_size: ``(width, height)`` of the page; only read when the
            block has a ``bbox``.

    Returns:
        dict: ``{'type': ..., 'content': {...}}`` plus ``bbox`` (scaled to
        integers on a 0-1000 grid relative to the page size) when the block
        has one. Block types not handled here yield a dict without
        ``type`` / ``content``.

    Raises:
        ValueError: for a code block whose ``sub_type`` is neither code nor
            algorithm, or a list block with an unsupported ``sub_type``.
    """
    para_type = para_block['type']
    para_content = {}
    if para_type in [
        BlockType.HEADER,
        BlockType.FOOTER,
        BlockType.ASIDE_TEXT,
        BlockType.PAGE_NUMBER,
        BlockType.PAGE_FOOTNOTE,
    ]:
        # Page-level furniture: map the block type onto its v2 content type.
        if para_type == BlockType.HEADER:
            content_type = ContentTypeV2.PAGE_HEADER
        elif para_type == BlockType.FOOTER:
            content_type = ContentTypeV2.PAGE_FOOTER
        elif para_type == BlockType.ASIDE_TEXT:
            content_type = ContentTypeV2.PAGE_ASIDE_TEXT
        elif para_type == BlockType.PAGE_NUMBER:
            content_type = ContentTypeV2.PAGE_NUMBER
        elif para_type == BlockType.PAGE_FOOTNOTE:
            content_type = ContentTypeV2.PAGE_FOOTNOTE
        else:
            raise ValueError(f"Unknown para_type: {para_type}")
        para_content = {
            'type': content_type,
            'content': {
                f"{content_type}_content": merge_para_with_text_v2(para_block),
            }
        }
    elif para_type == BlockType.TITLE:
        title_level = get_title_level(para_block)
        if title_level != 0:
            para_content = {
                'type': ContentTypeV2.TITLE,
                'content': {
                    "title_content": merge_para_with_text_v2(para_block),
                    "level": title_level
                }
            }
        else:
            # Level 0 means "not a heading": emitted as a plain paragraph.
            para_content = {
                'type': ContentTypeV2.PARAGRAPH,
                'content': {
                    "paragraph_content": merge_para_with_text_v2(para_block),
                }
            }
    elif para_type in [
        BlockType.TEXT,
        BlockType.PHONETIC
    ]:
        para_content = {
            'type': ContentTypeV2.PARAGRAPH,
            'content': {
                'paragraph_content': merge_para_with_text_v2(para_block),
            }
        }
    elif para_type == BlockType.INTERLINE_EQUATION:
        image_path, math_content = get_body_data(para_block)
        para_content = {
            'type': ContentTypeV2.EQUATION_INTERLINE,
            'content': {
                'math_content': math_content,
                'math_type': 'latex',
                'image_source': {'path': f"{img_buket_path}/{image_path}"},
            }
        }
    elif para_type == BlockType.IMAGE:
        image_caption = []
        image_footnote = []
        image_path, _ = get_body_data(para_block)
        image_source = {
            'path': f"{img_buket_path}/{image_path}",
        }
        for block in para_block['blocks']:
            if block['type'] == BlockType.IMAGE_CAPTION:
                image_caption.extend(merge_para_with_text_v2(block))
            if block['type'] == BlockType.IMAGE_FOOTNOTE:
                image_footnote.extend(merge_para_with_text_v2(block))
        para_content = {
            'type': ContentTypeV2.IMAGE,
            'content': {
                'image_source': image_source,
                'image_caption': image_caption,
                'image_footnote': image_footnote,
            }
        }
    elif para_type == BlockType.TABLE:
        table_caption = []
        table_footnote = []
        image_path, html = get_body_data(para_block)
        image_source = {
            'path': f"{img_buket_path}/{image_path}",
        }
        # More than one <table tag in the HTML is treated as a nested table.
        if html.count("<table") > 1:
            table_nest_level = 2
        else:
            table_nest_level = 1
        # Merged cells or nesting make the table "complex".
        if (
            "colspan" in html or
            "rowspan" in html or
            table_nest_level > 1
        ):
            table_type = ContentTypeV2.TABLE_COMPLEX
        else:
            table_type = ContentTypeV2.TABLE_SIMPLE

        for block in para_block['blocks']:
            if block['type'] == BlockType.TABLE_CAPTION:
                table_caption.extend(merge_para_with_text_v2(block))
            if block['type'] == BlockType.TABLE_FOOTNOTE:
                table_footnote.extend(merge_para_with_text_v2(block))
        para_content = {
            'type': ContentTypeV2.TABLE,
            'content': {
                'image_source': image_source,
                'table_caption': table_caption,
                'table_footnote': table_footnote,
                'html': html,
                'table_type': table_type,
                'table_nest_level': table_nest_level,
            }
        }
    elif para_type == BlockType.CODE:
        code_caption = []
        code_content = []
        for block in para_block['blocks']:
            if block['type'] == BlockType.CODE_CAPTION:
                code_caption.extend(merge_para_with_text_v2(block))
            if block['type'] == BlockType.CODE_BODY:
                # Assigned, not extended: the last code body wins.
                code_content = merge_para_with_text_v2(block)
        sub_type = para_block["sub_type"]
        if sub_type == BlockType.CODE:
            para_content = {
                'type': ContentTypeV2.CODE,
                'content': {
                    'code_caption': code_caption,
                    'code_content': code_content,
                    'code_language': para_block.get('guess_lang', 'txt'),
                }
            }
        elif sub_type == BlockType.ALGORITHM:
            para_content = {
                'type': ContentTypeV2.ALGORITHM,
                'content': {
                    'algorithm_caption': code_caption,
                    'algorithm_content': code_content,
                }
            }
        else:
            raise ValueError(f"Unknown code sub_type: {sub_type}")
    elif para_type == BlockType.REF_TEXT:
        # A stand-alone reference becomes a single-item reference list.
        para_content = {
            'type': ContentTypeV2.LIST,
            'content': {
                'list_type': ContentTypeV2.LIST_REF,
                'list_items': [
                    {
                        'item_type': 'text',
                        'item_content': merge_para_with_text_v2(para_block),
                    }
                ],
            }
        }
    elif para_type == BlockType.LIST:
        if 'sub_type' in para_block:
            if para_block['sub_type'] == BlockType.REF_TEXT:
                list_type = ContentTypeV2.LIST_REF
            elif para_block['sub_type'] == BlockType.TEXT:
                list_type = ContentTypeV2.LIST_TEXT
            else:
                raise ValueError(f"Unknown list sub_type: {para_block['sub_type']}")
        else:
            list_type = ContentTypeV2.LIST_TEXT
        list_items = []
        for block in para_block['blocks']:
            item_content = merge_para_with_text_v2(block)
            # Items without any content are not reported.
            if item_content:
                list_items.append({
                    'item_type': 'text',
                    'item_content': item_content,
                })
        para_content = {
            'type': ContentTypeV2.LIST,
            'content': {
                'list_type': list_type,
                'list_items': list_items,
            }
        }

    para_bbox = para_block.get('bbox')
    if para_bbox:
        # page_size is only unpacked when a bbox has to be scaled, so blocks
        # without a bbox no longer fail when the page carries no page_size.
        page_width, page_height = page_size
        x0, y0, x1, y1 = para_bbox
        para_content['bbox'] = [
            int(x0 * 1000 / page_width),
            int(y0 * 1000 / page_height),
            int(x1 * 1000 / page_width),
            int(y1 * 1000 / page_height),
        ]

    return para_content
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def get_body_data(para_block):
    """Return ``(image_path, payload)`` for the first relevant span of a block.

    The payload depends on the type of the first matching span:
      - table span:              ``(image_path, html)``
      - image span:              ``(image_path, '')``
      - interline-equation span: ``(image_path, content)``
      - text span:               ``('', content)``
      - nothing found:           ``('', '')``

    When the block has a ``blocks`` key, only its image/table/code body
    children are searched and the first non-empty result is returned;
    the block's own ``lines`` are not consulted in that case. Otherwise
    the block's own ``lines`` are searched.
    """
    nothing = ('', '')

    def scan(lines):
        # First span with a recognised type decides the result.
        for line in lines:
            for span in line.get('spans', []):
                kind = span.get('type')
                if kind == ContentType.TABLE:
                    return span.get('image_path', ''), span.get('html', '')
                if kind == ContentType.IMAGE:
                    return span.get('image_path', ''), ''
                if kind == ContentType.INTERLINE_EQUATION:
                    return span.get('image_path', ''), span.get('content', '')
                if kind == ContentType.TEXT:
                    return '', span.get('content', '')
        return nothing

    # Flat structure: the block holds its lines directly.
    if 'blocks' not in para_block:
        return scan(para_block.get('lines', []))

    # Nested structure: look inside the body children only.
    for child in para_block['blocks']:
        if child.get('type') not in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY, BlockType.CODE_BODY]:
            continue
        found = scan(child.get('lines', []))
        if found != nothing:
            return found
    return nothing
|
||||
|
||||
|
||||
def merge_para_with_text_v2(para_block):
    """Merge the spans of a block into a list of typed ``content_list_v2`` spans.

    Text spans are first rewritten in place with the result of
    ``full_to_half_exclude_marks`` and their text is used to detect the
    block language. Spans with blank content are skipped. Text spans of a
    phonetic block become phonetic spans; inline equations become inline
    equation spans. Consecutive plain-text spans are merged into one entry,
    with the same spacing / line-end hyphen rules as the string variant:
    CJK text gets no space at a line end, and in other languages a
    line-end hyphen is removed when the next line starts with a lowercase
    text span. Any other span type is logged as a warning and dropped.

    Returns:
        list[dict]: entries of the form ``{'type': ..., 'content': ...}``.
    """
    lines = para_block['lines']

    # Pass 1: normalise text spans and gather their text for language detection.
    pieces = []
    for line in lines:
        for span in line['spans']:
            if span['type'] in [ContentType.TEXT]:
                span['content'] = full_to_half_exclude_marks(span['content'])
                pieces.append(span['content'])
    block_lang = detect_lang(''.join(pieces))

    def next_line_starts_lowercase(line_no):
        # True when the following line opens with a lowercase text span.
        if line_no + 1 >= len(lines):
            return False
        following = lines[line_no + 1].get('spans')
        if not following:
            return False
        head = following[0]
        if head.get('type') != ContentType.TEXT:
            return False
        head_text = head.get('content', '')
        return bool(head_text) and head_text[0].islower()

    merged = []
    para_type = para_block['type']
    for line_no, line in enumerate(lines):
        for span_no, span in enumerate(line['spans']):
            span_type = span['type']
            if not span.get("content", '').strip():
                continue

            # Translate the span type into its v2 counterpart.
            if span_type == ContentType.TEXT:
                if para_type == BlockType.PHONETIC:
                    span_type = ContentTypeV2.SPAN_PHONETIC
                else:
                    span_type = ContentTypeV2.SPAN_TEXT
            if span_type == ContentType.INLINE_EQUATION:
                span_type = ContentTypeV2.SPAN_EQUATION_INLINE

            if span_type in [ContentTypeV2.SPAN_TEXT]:
                raw = span['content']
                # Chinese / Japanese / Korean
                cjk_langs = {'zh', 'ja', 'ko'}
                at_line_end = span_no == len(line['spans']) - 1

                if block_lang in cjk_langs:
                    # CJK: no separating space at a line break.
                    piece = raw if at_line_end else f"{raw} "
                elif at_line_end and is_hyphen_at_line_end(raw):
                    # Drop the hyphen only when the word continues in lowercase;
                    # either way no space is appended.
                    piece = raw[:-1] if next_line_starts_lowercase(line_no) else raw
                else:
                    # Western text: spans are separated by a space.
                    piece = f"{raw} "

                if merged and merged[-1]['type'] == span_type:
                    # Fold into the preceding span of the same type.
                    merged[-1]['content'] += piece
                else:
                    merged.append({
                        'type': span_type,
                        'content': piece,
                    })
            elif span_type in [
                ContentTypeV2.SPAN_PHONETIC,
                ContentTypeV2.SPAN_EQUATION_INLINE,
            ]:
                merged.append({
                    'type': span_type,
                    'content': span['content'],
                })
            else:
                logger.warning(f"Unknown span type in merge_para_with_text_v2: {span_type}")
    return merged
|
||||
|
||||
|
||||
def union_make(pdf_info_dict: list,
               make_mode: str,
               img_buket_path: str = '',
               ):
    """Build the final output for a document in the requested mode.

    Args:
        pdf_info_dict: list of page dicts; ``para_blocks``,
            ``discarded_blocks``, ``page_idx`` and ``page_size`` are read
            from each page.
        make_mode: one of the ``MakeMode`` values.
        img_buket_path: directory prefix for image paths in the output.

    Returns:
        - Markdown modes: the page fragments joined with blank lines (str);
          pages without ``para_blocks`` are skipped.
        - ``CONTENT_LIST``: a flat list with one entry per block.
        - ``CONTENT_LIST_V2``: a list with one (possibly empty) list of
          entries per page.
        - any other mode: ``None``.
    """
    formula_enable = get_formula_enable(os.getenv('MINERU_VLM_FORMULA_ENABLE', 'True').lower() == 'true')
    table_enable = get_table_enable(os.getenv('MINERU_VLM_TABLE_ENABLE', 'True').lower() == 'true')

    wants_markdown = make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]
    wants_flat_list = make_mode == MakeMode.CONTENT_LIST
    wants_paged_list = make_mode == MakeMode.CONTENT_LIST_V2

    collected = []
    for page_info in pdf_info_dict:
        layout_blocks = page_info.get('para_blocks')
        discarded_blocks = page_info.get('discarded_blocks')
        page_idx = page_info.get('page_idx')
        page_size = page_info.get('page_size')

        if wants_markdown:
            if layout_blocks:
                collected.extend(
                    mk_blocks_to_markdown(layout_blocks, make_mode, formula_enable, table_enable, img_buket_path)
                )
            continue

        if not (wants_flat_list or wants_paged_list):
            continue

        # Content lists also include the discarded blocks of the page.
        all_blocks = (layout_blocks or []) + (discarded_blocks or [])
        if wants_flat_list:
            collected.extend(
                make_blocks_to_content_list(block, img_buket_path, page_idx, page_size)
                for block in all_blocks
            )
        else:
            # https://github.com/drunkpig/llm-webkit-mirror/blob/dev6/docs/specification/output_format/content_list_spec.md
            collected.append([
                make_blocks_to_content_list_v2(block, img_buket_path, page_size)
                for block in all_blocks
            ])

    if wants_markdown:
        return '\n\n'.join(collected)
    if wants_flat_list or wants_paged_list:
        return collected
    return None
|
||||
|
||||
|
||||
def get_title_level(block):
    """Return the heading level of a title block, limited to at most 4.

    The level is read from ``block['level']`` (default 1). Values above 4
    are capped at 4; values below 1 yield 0.
    """
    level = block.get('level', 1)
    if level < 1:
        return 0
    return 4 if level > 4 else level
|
||||
@@ -15,7 +15,7 @@ from mineru.utils.config_reader import get_device
|
||||
from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path
|
||||
from mineru.utils.model_utils import get_vram
|
||||
from ..version import __version__
|
||||
from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
|
||||
from .common import do_parse, read_fn, pdf_suffixes, image_suffixes, office_suffixes
|
||||
|
||||
|
||||
@click.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=True))
|
||||
@@ -213,7 +213,7 @@ def main(
|
||||
if os.path.isdir(input_path):
|
||||
doc_path_list = []
|
||||
for doc_path in Path(input_path).glob('*'):
|
||||
if guess_suffix_by_path(doc_path) in pdf_suffixes + image_suffixes:
|
||||
if guess_suffix_by_path(doc_path) in pdf_suffixes + image_suffixes + office_suffixes:
|
||||
doc_path_list.append(doc_path)
|
||||
parse_doc(doc_path_list)
|
||||
else:
|
||||
|
||||
@@ -15,10 +15,12 @@ from mineru.utils.enum_class import MakeMode
|
||||
from mineru.utils.guess_suffix_or_lang import guess_suffix_by_bytes
|
||||
from mineru.utils.pdf_image_tools import images_bytes_to_pdf_bytes
|
||||
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
|
||||
from mineru.backend.office.office_middle_json_mkcontent import union_make as office_union_make
|
||||
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
|
||||
from mineru.backend.vlm.vlm_analyze import aio_doc_analyze as aio_vlm_doc_analyze
|
||||
from mineru.backend.hybrid.hybrid_analyze import doc_analyze as hybrid_doc_analyze
|
||||
from mineru.backend.hybrid.hybrid_analyze import aio_doc_analyze as aio_hybrid_doc_analyze
|
||||
from mineru.backend.office.docx_analyze import office_docx_analyze
|
||||
from mineru.utils.pdf_page_id import get_end_page_id
|
||||
|
||||
if os.getenv("MINERU_LMDEPLOY_DEVICE", "") == "maca":
|
||||
@@ -28,6 +30,10 @@ if os.getenv("MINERU_LMDEPLOY_DEVICE", "") == "maca":
|
||||
|
||||
pdf_suffixes = ["pdf"]
|
||||
image_suffixes = ["png", "jpeg", "jp2", "webp", "gif", "bmp", "jpg", "tiff"]
|
||||
docx_suffixes = ["docx"]
|
||||
pptx_suffixes = ["pptx"]
|
||||
xlsx_suffixes = ["xlsx"]
|
||||
office_suffixes = docx_suffixes + pptx_suffixes + xlsx_suffixes
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
|
||||
@@ -39,7 +45,7 @@ def read_fn(path):
|
||||
file_suffix = guess_suffix_by_bytes(file_bytes, path)
|
||||
if file_suffix in image_suffixes:
|
||||
return images_bytes_to_pdf_bytes(file_bytes)
|
||||
elif file_suffix in pdf_suffixes:
|
||||
elif file_suffix in pdf_suffixes + office_suffixes:
|
||||
return file_bytes
|
||||
else:
|
||||
raise Exception(f"Unknown file suffix: {file_suffix}")
|
||||
@@ -110,10 +116,18 @@ def _process_output(
|
||||
f_make_md_mode,
|
||||
middle_json,
|
||||
model_output=None,
|
||||
is_pipeline=True
|
||||
process_mode="vlm",
|
||||
):
|
||||
f_draw_line_sort_bbox = False
|
||||
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
|
||||
if process_mode == "pipeline":
|
||||
make_func = pipeline_union_make
|
||||
elif process_mode == "vlm":
|
||||
make_func = vlm_union_make
|
||||
elif process_mode in office_suffixes:
|
||||
make_func = office_union_make
|
||||
else:
|
||||
raise Exception(f"Unknown process_mode: {process_mode}")
|
||||
"""处理输出文件"""
|
||||
if f_draw_layout_bbox:
|
||||
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")
|
||||
@@ -122,10 +136,16 @@ def _process_output(
|
||||
draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")
|
||||
|
||||
if f_dump_orig_pdf:
|
||||
md_writer.write(
|
||||
f"{pdf_file_name}_origin.pdf",
|
||||
pdf_bytes,
|
||||
)
|
||||
if process_mode in ["pipeline", "vlm"]:
|
||||
md_writer.write(
|
||||
f"{pdf_file_name}_origin.pdf",
|
||||
pdf_bytes,
|
||||
)
|
||||
elif process_mode in office_suffixes:
|
||||
md_writer.write(
|
||||
f"{pdf_file_name}_origin.{process_mode}",
|
||||
pdf_bytes,
|
||||
)
|
||||
|
||||
if f_draw_line_sort_bbox:
|
||||
draw_line_sort_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_line_sort.pdf")
|
||||
@@ -133,7 +153,6 @@ def _process_output(
|
||||
image_dir = str(os.path.basename(local_image_dir))
|
||||
|
||||
if f_dump_md:
|
||||
make_func = pipeline_union_make if is_pipeline else vlm_union_make
|
||||
md_content_str = make_func(pdf_info, f_make_md_mode, image_dir)
|
||||
md_writer.write_string(
|
||||
f"{pdf_file_name}.md",
|
||||
@@ -141,13 +160,12 @@ def _process_output(
|
||||
)
|
||||
|
||||
if f_dump_content_list:
|
||||
make_func = pipeline_union_make if is_pipeline else vlm_union_make
|
||||
content_list = make_func(pdf_info, MakeMode.CONTENT_LIST, image_dir)
|
||||
md_writer.write_string(
|
||||
f"{pdf_file_name}_content_list.json",
|
||||
json.dumps(content_list, ensure_ascii=False, indent=4),
|
||||
)
|
||||
if not is_pipeline:
|
||||
if process_mode != "pipeline":
|
||||
content_list_v2 = make_func(pdf_info, MakeMode.CONTENT_LIST_V2, image_dir)
|
||||
md_writer.write_string(
|
||||
f"{pdf_file_name}_content_list_v2.json",
|
||||
@@ -221,7 +239,7 @@ def _process_pipeline(
|
||||
pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
|
||||
md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
|
||||
f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
|
||||
f_make_md_mode, middle_json, model_json, is_pipeline=True
|
||||
f_make_md_mode, middle_json, model_json, process_mode="pipeline"
|
||||
)
|
||||
|
||||
|
||||
@@ -262,7 +280,7 @@ async def _async_process_vlm(
|
||||
pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
|
||||
md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
|
||||
f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
|
||||
f_make_md_mode, middle_json, infer_result, is_pipeline=False
|
||||
f_make_md_mode, middle_json, infer_result, process_mode="vlm"
|
||||
)
|
||||
|
||||
|
||||
@@ -303,7 +321,7 @@ def _process_vlm(
|
||||
pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
|
||||
md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
|
||||
f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
|
||||
f_make_md_mode, middle_json, infer_result, is_pipeline=False
|
||||
f_make_md_mode, middle_json, infer_result, process_mode="vlm"
|
||||
)
|
||||
|
||||
|
||||
@@ -355,7 +373,7 @@ def _process_hybrid(
|
||||
pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
|
||||
md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
|
||||
f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
|
||||
f_make_md_mode, middle_json, infer_result, is_pipeline=False
|
||||
f_make_md_mode, middle_json, infer_result, process_mode="vlm"
|
||||
)
|
||||
|
||||
|
||||
@@ -408,10 +426,56 @@ async def _async_process_hybrid(
|
||||
pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
|
||||
md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
|
||||
f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
|
||||
f_make_md_mode, middle_json, infer_result, is_pipeline=False
|
||||
f_make_md_mode, middle_json, infer_result, process_mode="vlm"
|
||||
)
|
||||
|
||||
|
||||
def _process_office_doc(
        output_dir,
        pdf_file_names: list[str],
        pdf_bytes_list: list[bytes],
        f_dump_md=True,
        f_dump_middle_json=True,
        f_dump_orig_file=True,
        f_dump_content_list=True,
        f_make_md_mode=MakeMode.MM_MD,
):
    """Handle the office documents contained in a batch of input files.

    The suffix of every entry in ``pdf_bytes_list`` is guessed from its bytes
    with ``guess_suffix_by_bytes``:

    * DOCX entries are analyzed with ``office_docx_analyze`` and their results
      are written through ``_process_output`` with ``process_mode="docx"``.
    * PPTX and XLSX entries are not converted; a warning is logged for each.
    * Entries with any other suffix are left untouched.

    Neither input list is modified here.

    Args:
        output_dir: Base output directory, forwarded to ``prepare_env``.
        pdf_file_names: File names, index-aligned with ``pdf_bytes_list``.
        pdf_bytes_list: Raw bytes of each input file.
        f_dump_md: Forwarded to ``_process_output``.
        f_dump_middle_json: Forwarded to ``_process_output``.
        f_dump_orig_file: Forwarded to ``_process_output`` in the position the
            PDF paths use for their "dump original" flag.
        f_dump_content_list: Forwarded to ``_process_output``.
        f_make_md_mode: Forwarded to ``_process_output``.

    Returns:
        list[int]: Indices (in ascending order) of every DOCX, PPTX and XLSX
        entry, i.e. the entries that were recognised as office documents.
    """
    # These outputs are always switched off for office documents in this
    # function; they are loop-invariant, so they are set once up front.
    f_dump_model_output = False
    f_draw_layout_bbox = False
    f_draw_span_bbox = False

    need_remove_index = []
    for i, file_bytes in enumerate(pdf_bytes_list):
        pdf_file_name = pdf_file_names[i]
        file_suffix = guess_suffix_by_bytes(file_bytes)

        if file_suffix in docx_suffixes:
            # Recorded before conversion starts, exactly as the index is
            # recorded for the unsupported formats below.
            need_remove_index.append(i)

            local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, "office")
            image_writer = FileBasedDataWriter(local_image_dir)
            md_writer = FileBasedDataWriter(local_md_dir)

            middle_json, infer_result = office_docx_analyze(
                file_bytes,
                image_writer=image_writer,
            )
            pdf_info = middle_json["pdf_info"]

            _process_output(
                pdf_info, file_bytes, pdf_file_name, local_md_dir, local_image_dir,
                md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_file,
                f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
                f_make_md_mode, middle_json, infer_result, process_mode="docx"
            )
        elif file_suffix in pptx_suffixes:
            need_remove_index.append(i)
            logger.warning(f"Currently, PPTX files are not supported: {pdf_file_name}")
        elif file_suffix in xlsx_suffixes:
            need_remove_index.append(i)
            logger.warning(f"Currently, XLSX files are not supported: {pdf_file_name}")

    return need_remove_index
|
||||
|
||||
|
||||
def do_parse(
|
||||
output_dir,
|
||||
pdf_file_names: list[str],
|
||||
@@ -434,6 +498,24 @@ def do_parse(
|
||||
end_page_id=None,
|
||||
**kwargs,
|
||||
):
|
||||
need_remove_index = _process_office_doc(
|
||||
output_dir,
|
||||
pdf_file_names=pdf_file_names,
|
||||
pdf_bytes_list=pdf_bytes_list,
|
||||
f_dump_md=f_dump_md,
|
||||
f_dump_middle_json=f_dump_middle_json,
|
||||
f_dump_orig_file=f_dump_orig_pdf,
|
||||
f_dump_content_list=f_dump_content_list,
|
||||
f_make_md_mode=f_make_md_mode,
|
||||
)
|
||||
for index in sorted(need_remove_index, reverse=True):
|
||||
del pdf_bytes_list[index]
|
||||
del pdf_file_names[index]
|
||||
del p_lang_list[index]
|
||||
if not pdf_bytes_list:
|
||||
logger.warning("No valid PDF or image files to process.")
|
||||
return
|
||||
|
||||
# 预处理PDF字节数据
|
||||
pdf_bytes_list = _prepare_pdf_bytes(pdf_bytes_list, start_page_id, end_page_id)
|
||||
|
||||
@@ -506,6 +588,24 @@ async def aio_do_parse(
|
||||
end_page_id=None,
|
||||
**kwargs,
|
||||
):
|
||||
need_remove_index = _process_office_doc(
|
||||
output_dir,
|
||||
pdf_file_names=pdf_file_names,
|
||||
pdf_bytes_list=pdf_bytes_list,
|
||||
f_dump_md=f_dump_md,
|
||||
f_dump_middle_json=f_dump_middle_json,
|
||||
f_dump_orig_file=f_dump_orig_pdf,
|
||||
f_dump_content_list=f_dump_content_list,
|
||||
f_make_md_mode=f_make_md_mode,
|
||||
)
|
||||
for index in sorted(need_remove_index, reverse=True):
|
||||
del pdf_bytes_list[index]
|
||||
del pdf_file_names[index]
|
||||
del p_lang_list[index]
|
||||
if not pdf_bytes_list:
|
||||
logger.warning("No valid PDF or image files to process.")
|
||||
return
|
||||
|
||||
# 预处理PDF字节数据
|
||||
pdf_bytes_list = _prepare_pdf_bytes(pdf_bytes_list, start_page_id, end_page_id)
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@ logger.add(sys.stderr, level=log_level) # 添加新handler
|
||||
|
||||
from base64 import b64encode
|
||||
|
||||
from mineru.cli.common import aio_do_parse, read_fn, pdf_suffixes, image_suffixes
|
||||
from mineru.cli.common import aio_do_parse, read_fn, pdf_suffixes, image_suffixes, office_suffixes
|
||||
from mineru.utils.cli_parser import arg_parse
|
||||
from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path
|
||||
from mineru.version import __version__
|
||||
@@ -187,7 +187,7 @@ async def parse_pdf(
|
||||
|
||||
# 如果是图像文件或PDF,使用read_fn处理
|
||||
file_suffix = guess_suffix_by_path(temp_path)
|
||||
if file_suffix in pdf_suffixes + image_suffixes:
|
||||
if file_suffix in pdf_suffixes + image_suffixes + office_suffixes:
|
||||
try:
|
||||
pdf_bytes = read_fn(temp_path)
|
||||
pdf_bytes_list.append(pdf_bytes)
|
||||
|
||||
0
mineru/model/docx/__init__.py
Normal file
0
mineru/model/docx/__init__.py
Normal file
1539
mineru/model/docx/docx_converter.py
Normal file
1539
mineru/model/docx/docx_converter.py
Normal file
File diff suppressed because it is too large
Load Diff
BIN
mineru/model/docx/drawingml.docx
Normal file
BIN
mineru/model/docx/drawingml.docx
Normal file
Binary file not shown.
BIN
mineru/model/docx/equations.docx
Normal file
BIN
mineru/model/docx/equations.docx
Normal file
Binary file not shown.
18
mineru/model/docx/main.py
Normal file
18
mineru/model/docx/main.py
Normal file
@@ -0,0 +1,18 @@
|
||||
from typing import BinaryIO
|
||||
|
||||
from mineru.model.docx.docx_converter import DocxConverter
|
||||
|
||||
|
||||
def convert_path(file_path: str):
    """Convert the document stored at ``file_path``.

    The file is opened in binary mode and the open handle is passed to
    ``convert_binary``; whatever that call returns is returned unchanged.
    """
    with open(file_path, mode="rb") as docx_stream:
        converted = convert_binary(docx_stream)
    return converted
|
||||
|
||||
|
||||
def convert_binary(file_binary: BinaryIO):
    """Convert an already opened binary stream with a fresh ``DocxConverter``.

    A new converter is created for every call, ``convert`` is run on
    ``file_binary``, and the converter's ``pages`` attribute is returned.
    """
    docx_converter = DocxConverter()
    docx_converter.convert(file_binary)
    result_pages = docx_converter.pages
    return result_pages
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke run: convert a sample file from the working directory
    # and print the result.
    sample_pages = convert_path("textbox.docx")
    print(sample_pages)
|
||||
BIN
mineru/model/docx/outputtest.png
Normal file
BIN
mineru/model/docx/outputtest.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 1.3 MiB |
2
mineru/model/docx/section (副本)/[Content_Types].xml
Normal file
2
mineru/model/docx/section (副本)/[Content_Types].xml
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"><Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/><Default Extension="xml" ContentType="application/xml"/><Override PartName="/docProps/app.xml" ContentType="application/vnd.openxmlformats-officedocument.extended-properties+xml"/><Override PartName="/docProps/core.xml" ContentType="application/vnd.openxmlformats-package.core-properties+xml"/><Override PartName="/docProps/custom.xml" ContentType="application/vnd.openxmlformats-officedocument.custom-properties+xml"/><Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/><Override PartName="/word/fontTable.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml"/><Override PartName="/word/footer1.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml"/><Override PartName="/word/header1.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml"/><Override PartName="/word/header2.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml"/><Override PartName="/word/header3.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml"/><Override PartName="/word/header4.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml"/><Override PartName="/word/settings.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml"/><Override PartName="/word/styles.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml"/><Override PartName="/word/theme/theme1.xml" ContentType="application/vnd.openxmlformats-officedocument.theme+xml"/></Types>
|
||||
2
mineru/model/docx/section (副本)/_rels/.rels
Normal file
2
mineru/model/docx/section (副本)/_rels/.rels
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId4" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/><Relationship Id="rId2" Type="http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties" Target="docProps/core.xml"/><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties" Target="docProps/app.xml"/><Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/custom-properties" Target="docProps/custom.xml"/></Relationships>
|
||||
2
mineru/model/docx/section (副本)/docProps/app.xml
Normal file
2
mineru/model/docx/section (副本)/docProps/app.xml
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/extended-properties" xmlns:vt="http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes"><Template>Normal.dotm</Template><Pages>8</Pages><Words>0</Words><Characters>0</Characters><Lines>0</Lines><Paragraphs>0</Paragraphs><TotalTime>1</TotalTime><ScaleCrop>false</ScaleCrop><LinksUpToDate>false</LinksUpToDate><CharactersWithSpaces>0</CharactersWithSpaces><Application>WPS Office_12.1.0.17900_F1E327BC-269C-435d-A152-05C5408002CA</Application><DocSecurity>0</DocSecurity></Properties>
|
||||
2
mineru/model/docx/section (副本)/docProps/core.xml
Normal file
2
mineru/model/docx/section (副本)/docProps/core.xml
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:dcmitype="http://purl.org/dc/dcmitype/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><dcterms:created xsi:type="dcterms:W3CDTF">2026-01-11T00:35:00Z</dcterms:created><dc:creator>sidney</dc:creator><cp:lastModifiedBy>sidney</cp:lastModifiedBy><dcterms:modified xsi:type="dcterms:W3CDTF">2026-01-09T16:53:33Z</dcterms:modified><cp:revision>1</cp:revision></cp:coreProperties>
|
||||
2
mineru/model/docx/section (副本)/docProps/custom.xml
Normal file
2
mineru/model/docx/section (副本)/docProps/custom.xml
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/custom-properties" xmlns:vt="http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes"><property fmtid="{D5CDD505-2E9C-101B-9397-08002B2CF9AE}" pid="2" name="KSOProductBuildVer"><vt:lpwstr>2052-12.1.0.17900</vt:lpwstr></property><property fmtid="{D5CDD505-2E9C-101B-9397-08002B2CF9AE}" pid="3" name="ICV"><vt:lpwstr>DFF0AFE5816D6E22E9BD60691B8F4357_41</vt:lpwstr></property></Properties>
|
||||
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId9" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable" Target="fontTable.xml"/><Relationship Id="rId8" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme" Target="theme/theme1.xml"/><Relationship Id="rId7" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/header" Target="header4.xml"/><Relationship Id="rId6" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/header" Target="header3.xml"/><Relationship Id="rId5" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/header" Target="header2.xml"/><Relationship Id="rId4" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/footer" Target="footer1.xml"/><Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/header" Target="header1.xml"/><Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings" Target="settings.xml"/><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles" Target="styles.xml"/></Relationships>
|
||||
261
mineru/model/docx/section (副本)/word/document.xml
Normal file
261
mineru/model/docx/section (副本)/word/document.xml
Normal file
@@ -0,0 +1,261 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:document xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas"
|
||||
xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"
|
||||
xmlns:o="urn:schemas-microsoft-com:office:office"
|
||||
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
|
||||
xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml"
|
||||
xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing"
|
||||
xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"
|
||||
xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
||||
xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml"
|
||||
xmlns:w10="urn:schemas-microsoft-com:office:word"
|
||||
xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml"
|
||||
xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup"
|
||||
xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk"
|
||||
xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml"
|
||||
xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape"
|
||||
xmlns:wpsCustomData="http://www.wps.cn/officeDocument/2013/wpsCustomData" mc:Ignorable="w14 w15 wp14">
|
||||
<w:body>
|
||||
<w:p w14:paraId="187C1D8D">
|
||||
<w:pPr>
|
||||
<w:rPr>
|
||||
<w:rFonts w:hint="eastAsia"/>
|
||||
<w:lang w:eastAsia="zh-CN"/>
|
||||
</w:rPr>
|
||||
</w:pPr>
|
||||
<w:r>
|
||||
<w:rPr>
|
||||
<w:rFonts w:hint="eastAsia"/>
|
||||
<w:lang w:eastAsia="zh-CN"/>
|
||||
</w:rPr>
|
||||
<w:t>第一节内容</w:t>
|
||||
</w:r>
|
||||
<w:r>
|
||||
<w:rPr>
|
||||
<w:rFonts w:hint="eastAsia"/>
|
||||
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
|
||||
</w:rPr>
|
||||
<w:t>1</w:t>
|
||||
</w:r>
|
||||
<w:r>
|
||||
<w:rPr>
|
||||
<w:rFonts w:hint="eastAsia"/>
|
||||
<w:lang w:eastAsia="zh-CN"/>
|
||||
</w:rPr>
|
||||
<w:br w:type="page"/>
|
||||
</w:r>
|
||||
</w:p>
|
||||
<w:p w14:paraId="7BED7957">
|
||||
<w:pPr>
|
||||
<w:rPr>
|
||||
<w:rFonts w:hint="eastAsia"/>
|
||||
<w:lang w:eastAsia="zh-CN"/>
|
||||
</w:rPr>
|
||||
</w:pPr>
|
||||
<w:r>
|
||||
<w:rPr>
|
||||
<w:rFonts w:hint="eastAsia"/>
|
||||
<w:lang w:eastAsia="zh-CN"/>
|
||||
</w:rPr>
|
||||
<w:t>第一节内容</w:t>
|
||||
</w:r>
|
||||
<w:r>
|
||||
<w:rPr>
|
||||
<w:rFonts w:hint="eastAsia"/>
|
||||
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
|
||||
</w:rPr>
|
||||
<w:t>2</w:t>
|
||||
</w:r>
|
||||
<w:r>
|
||||
<w:rPr>
|
||||
<w:rFonts w:hint="eastAsia"/>
|
||||
<w:lang w:eastAsia="zh-CN"/>
|
||||
</w:rPr>
|
||||
<w:br w:type="page"/>
|
||||
</w:r>
|
||||
</w:p>
|
||||
<w:p w14:paraId="786FF74E">
|
||||
<w:pPr>
|
||||
<w:rPr>
|
||||
<w:rFonts w:hint="eastAsia"/>
|
||||
<w:lang w:eastAsia="zh-CN"/>
|
||||
</w:rPr>
|
||||
<w:sectPr>
|
||||
<w:headerReference r:id="rId3" w:type="default"/>
|
||||
<w:footerReference r:id="rId4" w:type="default"/>
|
||||
<w:pgSz w:w="11906" w:h="16838"/>
|
||||
<w:pgMar w:top="1440" w:right="1800" w:bottom="1440" w:left="1800" w:header="851" w:footer="992"
|
||||
w:gutter="0"/>
|
||||
<w:cols w:space="425" w:num="1"/>
|
||||
<w:docGrid w:type="lines" w:linePitch="312" w:charSpace="0"/>
|
||||
</w:sectPr>
|
||||
</w:pPr>
|
||||
</w:p>
|
||||
<w:p w14:paraId="544E2025">
|
||||
<w:pPr>
|
||||
<w:rPr>
|
||||
<w:rFonts w:hint="eastAsia"/>
|
||||
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
|
||||
</w:rPr>
|
||||
</w:pPr>
|
||||
<w:r>
|
||||
<w:rPr>
|
||||
<w:rFonts w:hint="eastAsia"/>
|
||||
<w:lang w:eastAsia="zh-CN"/>
|
||||
</w:rPr>
|
||||
<w:t>第二节内容</w:t>
|
||||
</w:r>
|
||||
<w:r>
|
||||
<w:rPr>
|
||||
<w:rFonts w:hint="eastAsia"/>
|
||||
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
|
||||
</w:rPr>
|
||||
<w:t>1</w:t>
|
||||
</w:r>
|
||||
</w:p>
|
||||
<w:p w14:paraId="620A8F4A">
|
||||
<w:pPr>
|
||||
<w:rPr>
|
||||
<w:rFonts w:hint="eastAsia"/>
|
||||
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
|
||||
</w:rPr>
|
||||
</w:pPr>
|
||||
<w:r>
|
||||
<w:rPr>
|
||||
<w:rFonts w:hint="eastAsia"/>
|
||||
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
|
||||
</w:rPr>
|
||||
<w:br w:type="page"/>
|
||||
</w:r>
|
||||
</w:p>
|
||||
<w:p w14:paraId="1545E12B">
|
||||
<w:pPr>
|
||||
<w:rPr>
|
||||
<w:rFonts w:hint="eastAsia"/>
|
||||
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
|
||||
</w:rPr>
|
||||
<w:sectPr>
|
||||
<w:headerReference r:id="rId5" w:type="default"/>
|
||||
<w:pgSz w:w="11906" w:h="16838"/>
|
||||
<w:pgMar w:top="1440" w:right="1800" w:bottom="1440" w:left="1800" w:header="851" w:footer="992"
|
||||
w:gutter="0"/>
|
||||
<w:cols w:space="425" w:num="1"/>
|
||||
<w:docGrid w:type="lines" w:linePitch="312" w:charSpace="0"/>
|
||||
</w:sectPr>
|
||||
</w:pPr>
|
||||
<w:r>
|
||||
<w:rPr>
|
||||
<w:rFonts w:hint="eastAsia"/>
|
||||
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
|
||||
</w:rPr>
|
||||
<w:t>第二节内容2</w:t>
|
||||
</w:r>
|
||||
</w:p>
|
||||
<w:p w14:paraId="206DE633">
|
||||
<w:pPr>
|
||||
<w:rPr>
|
||||
<w:rFonts w:hint="eastAsia"/>
|
||||
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
|
||||
</w:rPr>
|
||||
</w:pPr>
|
||||
<w:r>
|
||||
<w:rPr>
|
||||
<w:rFonts w:hint="eastAsia"/>
|
||||
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
|
||||
</w:rPr>
|
||||
<w:t>第三节内容1</w:t>
|
||||
</w:r>
|
||||
</w:p>
|
||||
<w:p w14:paraId="35E87C09">
|
||||
<w:pPr>
|
||||
<w:rPr>
|
||||
<w:rFonts w:hint="eastAsia"/>
|
||||
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
|
||||
</w:rPr>
|
||||
</w:pPr>
|
||||
<w:r>
|
||||
<w:rPr>
|
||||
<w:rFonts w:hint="eastAsia"/>
|
||||
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
|
||||
</w:rPr>
|
||||
<w:br w:type="page"/>
|
||||
</w:r>
|
||||
</w:p>
|
||||
<w:p w14:paraId="1977A116">
|
||||
<w:pPr>
|
||||
<w:rPr>
|
||||
<w:rFonts w:hint="eastAsia"/>
|
||||
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
|
||||
</w:rPr>
|
||||
<w:sectPr>
|
||||
<w:headerReference r:id="rId6" w:type="default"/>
|
||||
<w:pgSz w:w="11906" w:h="16838"/>
|
||||
<w:pgMar w:top="1440" w:right="1800" w:bottom="1440" w:left="1800" w:header="851" w:footer="992"
|
||||
w:gutter="0"/>
|
||||
<w:cols w:space="425" w:num="1"/>
|
||||
<w:docGrid w:type="lines" w:linePitch="312" w:charSpace="0"/>
|
||||
</w:sectPr>
|
||||
</w:pPr>
|
||||
<w:r>
|
||||
<w:rPr>
|
||||
<w:rFonts w:hint="eastAsia"/>
|
||||
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
|
||||
</w:rPr>
|
||||
<w:t>第三节内容2</w:t>
|
||||
</w:r>
|
||||
</w:p>
|
||||
<w:p w14:paraId="43AB5318">
|
||||
<w:pPr>
|
||||
<w:rPr>
|
||||
<w:rFonts w:hint="eastAsia"/>
|
||||
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
|
||||
</w:rPr>
|
||||
</w:pPr>
|
||||
<w:r>
|
||||
<w:rPr>
|
||||
<w:rFonts w:hint="eastAsia"/>
|
||||
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
|
||||
</w:rPr>
|
||||
<w:t>第四节内容1</w:t>
|
||||
</w:r>
|
||||
</w:p>
|
||||
<w:p w14:paraId="6FDEB506">
|
||||
<w:pPr>
|
||||
<w:rPr>
|
||||
<w:rFonts w:hint="eastAsia"/>
|
||||
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
|
||||
</w:rPr>
|
||||
</w:pPr>
|
||||
<w:r>
|
||||
<w:rPr>
|
||||
<w:rFonts w:hint="eastAsia"/>
|
||||
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
|
||||
</w:rPr>
|
||||
<w:br w:type="page"/>
|
||||
</w:r>
|
||||
</w:p>
|
||||
<w:p w14:paraId="04B961C8">
|
||||
<w:pPr>
|
||||
<w:rPr>
|
||||
<w:rFonts w:hint="default"/>
|
||||
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
|
||||
</w:rPr>
|
||||
</w:pPr>
|
||||
<w:r>
|
||||
<w:rPr>
|
||||
<w:rFonts w:hint="eastAsia"/>
|
||||
<w:lang w:val="en-US" w:eastAsia="zh-CN"/>
|
||||
</w:rPr>
|
||||
<w:t>第四节内容2</w:t>
|
||||
</w:r>
|
||||
</w:p>
|
||||
<w:sectPr>
|
||||
<w:headerReference r:id="rId7" w:type="default"/>
|
||||
<w:pgSz w:w="11906" w:h="16838"/>
|
||||
<w:pgMar w:top="1440" w:right="1800" w:bottom="1440" w:left="1800" w:header="851" w:footer="992"
|
||||
w:gutter="0"/>
|
||||
<w:cols w:space="425" w:num="1"/>
|
||||
<w:docGrid w:type="lines" w:linePitch="312" w:charSpace="0"/>
|
||||
</w:sectPr>
|
||||
</w:body>
|
||||
</w:document>
|
||||
2
mineru/model/docx/section (副本)/word/fontTable.xml
Normal file
2
mineru/model/docx/section (副本)/word/fontTable.xml
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:fonts xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" mc:Ignorable="w14"><w:font w:name="Times New Roman"><w:panose1 w:val="02020603050405020304"/><w:charset w:val="00"/><w:family w:val="roman"/><w:pitch w:val="variable"/><w:sig w:usb0="20007A87" w:usb1="80000000" w:usb2="00000008" w:usb3="00000000" w:csb0="000001FF" w:csb1="00000000"/></w:font><w:font w:name="宋体"><w:altName w:val="Droid Sans Fallback"/><w:panose1 w:val="00000000000000000000"/><w:charset w:val="00"/><w:family w:val="auto"/><w:pitch w:val="default"/><w:sig w:usb0="00000000" w:usb1="00000000" w:usb2="00000000" w:usb3="00000000" w:csb0="00000000" w:csb1="00000000"/></w:font><w:font w:name="Wingdings"><w:altName w:val="Noto Color Emoji"/><w:panose1 w:val="05000000000000000000"/><w:charset w:val="02"/><w:family w:val="auto"/><w:pitch w:val="default"/><w:sig w:usb0="00000000" w:usb1="00000000" w:usb2="00000000" w:usb3="00000000" w:csb0="80000000" w:csb1="00000000"/></w:font><w:font w:name="Arial"><w:altName w:val="DejaVu Sans"/><w:panose1 w:val="020B0604020202020204"/><w:charset w:val="01"/><w:family w:val="swiss"/><w:pitch w:val="default"/><w:sig w:usb0="E0002AFF" w:usb1="C0007843" w:usb2="00000009" w:usb3="00000000" w:csb0="400001FF" w:csb1="FFFF0000"/></w:font><w:font w:name="黑体"><w:altName w:val="Droid Sans Fallback"/><w:panose1 w:val="02010609060101010101"/><w:charset w:val="86"/><w:family w:val="auto"/><w:pitch w:val="default"/><w:sig w:usb0="800002BF" w:usb1="38CF7CFA" w:usb2="00000016" w:usb3="00000000" w:csb0="00040001" w:csb1="00000000"/></w:font><w:font w:name="Courier New"><w:altName w:val="DejaVu Sans"/><w:panose1 w:val="02070309020205020404"/><w:charset w:val="01"/><w:family w:val="modern"/><w:pitch w:val="default"/><w:sig w:usb0="E0002AFF" 
w:usb1="C0007843" w:usb2="00000009" w:usb3="00000000" w:csb0="400001FF" w:csb1="FFFF0000"/></w:font><w:font w:name="Symbol"><w:altName w:val="Noto Color Emoji"/><w:panose1 w:val="05050102010706020507"/><w:charset w:val="02"/><w:family w:val="roman"/><w:pitch w:val="default"/><w:sig w:usb0="00000000" w:usb1="00000000" w:usb2="00000000" w:usb3="00000000" w:csb0="80000000" w:csb1="00000000"/></w:font><w:font w:name="DejaVu Sans"><w:panose1 w:val="020B0603030804020204"/><w:charset w:val="00"/><w:family w:val="auto"/><w:pitch w:val="default"/><w:sig w:usb0="E7006EFF" w:usb1="D200FDFF" w:usb2="0A246029" w:usb3="0400200C" w:csb0="600001FF" w:csb1="DFFF0000"/></w:font><w:font w:name="Calibri"><w:altName w:val="DejaVu Sans"/><w:panose1 w:val="020F0502020204030204"/><w:charset w:val="00"/><w:family w:val="swiss"/><w:pitch w:val="default"/><w:sig w:usb0="00000000" w:usb1="00000000" w:usb2="00000001" w:usb3="00000000" w:csb0="0000019F" w:csb1="00000000"/></w:font><w:font w:name="Droid Sans Fallback"><w:panose1 w:val="020B0502000000000001"/><w:charset w:val="86"/><w:family w:val="auto"/><w:pitch w:val="default"/><w:sig w:usb0="910002FF" w:usb1="2BDFFCFB" w:usb2="00000036" w:usb3="00000000" w:csb0="203F01FF" w:csb1="D7FF0000"/></w:font><w:font w:name="Noto Color Emoji"><w:panose1 w:val="02000609000000000000"/><w:charset w:val="00"/><w:family w:val="auto"/><w:pitch w:val="default"/><w:sig w:usb0="00000001" w:usb1="00000000" w:usb2="00000000" w:usb3="00000000" w:csb0="00000001" w:csb1="00000000"/></w:font></w:fonts>
|
||||
2
mineru/model/docx/section (副本)/word/footer1.xml
Normal file
2
mineru/model/docx/section (副本)/word/footer1.xml
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:ftr xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" xmlns:wpsCustomData="http://www.wps.cn/officeDocument/2013/wpsCustomData" mc:Ignorable="w14 w15 wp14"><w:p w14:paraId="7D3B533F"><w:pPr><w:pStyle w:val="2"/></w:pPr></w:p></w:ftr>
|
||||
2
mineru/model/docx/section (副本)/word/header1.xml
Normal file
2
mineru/model/docx/section (副本)/word/header1.xml
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:hdr xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" xmlns:wpsCustomData="http://www.wps.cn/officeDocument/2013/wpsCustomData" mc:Ignorable="w14 w15 wp14"><w:p w14:paraId="4B194AF1"><w:pPr><w:pStyle w:val="3"/><w:rPr><w:rFonts w:hint="default"/><w:lang w:val="en-US" w:eastAsia="zh-CN"/></w:rPr></w:pPr><w:r><w:rPr><w:rFonts w:hint="eastAsia"/><w:lang w:eastAsia="zh-CN"/></w:rPr><w:t>第一节页眉</w:t></w:r></w:p></w:hdr>
|
||||
2
mineru/model/docx/section (副本)/word/header2.xml
Normal file
2
mineru/model/docx/section (副本)/word/header2.xml
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:hdr xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" xmlns:wpsCustomData="http://www.wps.cn/officeDocument/2013/wpsCustomData" mc:Ignorable="w14 w15 wp14"><w:p w14:paraId="3762EF24"><w:pPr><w:pStyle w:val="3"/><w:rPr><w:rFonts w:hint="default"/><w:lang w:val="en-US" w:eastAsia="zh-CN"/></w:rPr></w:pPr><w:r><w:rPr><w:rFonts w:hint="eastAsia"/><w:lang w:eastAsia="zh-CN"/></w:rPr><w:t>第二节页眉</w:t></w:r></w:p></w:hdr>
|
||||
2
mineru/model/docx/section (副本)/word/header3.xml
Normal file
2
mineru/model/docx/section (副本)/word/header3.xml
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:hdr xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" xmlns:wpsCustomData="http://www.wps.cn/officeDocument/2013/wpsCustomData" mc:Ignorable="w14 w15 wp14"><w:p w14:paraId="4D235CA5"><w:pPr><w:pStyle w:val="3"/><w:rPr><w:rFonts w:hint="default"/><w:lang w:val="en-US" w:eastAsia="zh-CN"/></w:rPr></w:pPr><w:bookmarkStart w:id="0" w:name="_GoBack"/><w:r><w:rPr><w:rFonts w:hint="eastAsia"/><w:lang w:eastAsia="zh-CN"/></w:rPr><w:t>第三节页眉</w:t></w:r></w:p><w:bookmarkEnd w:id="0"/></w:hdr>
|
||||
2
mineru/model/docx/section (副本)/word/header4.xml
Normal file
2
mineru/model/docx/section (副本)/word/header4.xml
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:hdr xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" xmlns:wpsCustomData="http://www.wps.cn/officeDocument/2013/wpsCustomData" mc:Ignorable="w14 w15 wp14"><w:p w14:paraId="17918B03"><w:pPr><w:pStyle w:val="3"/><w:rPr><w:rFonts w:hint="default"/><w:lang w:val="en-US" w:eastAsia="zh-CN"/></w:rPr></w:pPr><w:r><w:rPr><w:rFonts w:hint="eastAsia"/><w:lang w:eastAsia="zh-CN"/></w:rPr><w:t>第四节页眉</w:t></w:r></w:p></w:hdr>
|
||||
2
mineru/model/docx/section (副本)/word/settings.xml
Normal file
2
mineru/model/docx/section (副本)/word/settings.xml
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:settings xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:sl="http://schemas.openxmlformats.org/schemaLibrary/2006/main" xmlns:wpsCustomData="http://www.wps.cn/officeDocument/2013/wpsCustomData" mc:Ignorable="w14"><w:zoom w:percent="60"/><w:embedSystemFonts/><w:bordersDoNotSurroundHeader w:val="1"/><w:bordersDoNotSurroundFooter w:val="1"/><w:documentProtection w:enforcement="0"/><w:defaultTabStop w:val="420"/><w:drawingGridVerticalSpacing w:val="156"/><w:displayHorizontalDrawingGridEvery w:val="0"/><w:displayVerticalDrawingGridEvery w:val="2"/><w:characterSpacingControl w:val="compressPunctuation"/><w:compat><w:spaceForUL/><w:balanceSingleByteDoubleByteWidth/><w:doNotLeaveBackslashAlone/><w:ulTrailSpace/><w:doNotExpandShiftReturn/><w:adjustLineHeightInTable/><w:useFELayout/><w:compatSetting w:name="compatibilityMode" w:uri="http://schemas.microsoft.com/office/word" w:val="14"/><w:compatSetting w:name="overrideTableStyleFontSizeAndJustification" w:uri="http://schemas.microsoft.com/office/word" w:val="1"/><w:compatSetting w:name="enableOpenTypeFeatures" w:uri="http://schemas.microsoft.com/office/word" w:val="1"/><w:compatSetting w:name="doNotFlipMirrorIndents" w:uri="http://schemas.microsoft.com/office/word" w:val="1"/></w:compat><w:rsids><w:rsidRoot w:val="B8F19361"/><w:rsid w:val="3FDD74DB"/><w:rsid w:val="5ECF68F6"/><w:rsid w:val="6BC3D698"/><w:rsid w:val="777D451B"/><w:rsid w:val="7F39D9A1"/><w:rsid w:val="B8F19361"/><w:rsid w:val="FDF770BE"/></w:rsids><m:mathPr><m:mathFont m:val="Cambria Math"/><m:brkBin 
m:val="before"/><m:brkBinSub m:val="--"/><m:smallFrac m:val="0"/><m:dispDef/><m:lMargin m:val="0"/><m:rMargin m:val="0"/><m:defJc m:val="centerGroup"/><m:wrapIndent m:val="1440"/><m:intLim m:val="subSup"/><m:naryLim m:val="undOvr"/></m:mathPr><w:themeFontLang w:val="en-US" w:eastAsia="zh-CN"/><w:clrSchemeMapping w:bg1="light1" w:t1="dark1" w:bg2="light2" w:t2="dark2" w:accent1="accent1" w:accent2="accent2" w:accent3="accent3" w:accent4="accent4" w:accent5="accent5" w:accent6="accent6" w:hyperlink="hyperlink" w:followedHyperlink="followedHyperlink"/><w:doNotIncludeSubdocsInStats/></w:settings>
|
||||
2
mineru/model/docx/section (副本)/word/styles.xml
Normal file
2
mineru/model/docx/section (副本)/word/styles.xml
Normal file
File diff suppressed because one or more lines are too long
2
mineru/model/docx/section (副本)/word/theme/theme1.xml
Normal file
2
mineru/model/docx/section (副本)/word/theme/theme1.xml
Normal file
File diff suppressed because one or more lines are too long
BIN
mineru/model/docx/section.docx
Normal file
BIN
mineru/model/docx/section.docx
Normal file
Binary file not shown.
BIN
mineru/model/docx/test.docx
Normal file
BIN
mineru/model/docx/test.docx
Normal file
Binary file not shown.
144
mineru/model/docx/test.html
Normal file
144
mineru/model/docx/test.html
Normal file
@@ -0,0 +1,144 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title>Title</title>
|
||||
<p><a id="_Toc411426751"></a><strong>采购合同</strong></p>
|
||||
<p>合同编号: </p>
|
||||
<p>签订地点: </p>
|
||||
<p>签订时间: </p>
|
||||
<p>采购人(甲方): </p>
|
||||
<p>供应商(乙方): </p>
|
||||
<p><a id="_九、其他"></a><a id="_Toc217446115"></a>根据《中华人民共和国政府采购法》、《中华人民共和国合同法》及XX政府采购中心
|
||||
采购项目(项目编号:XX)的《招标文件》、乙方的《投标文件》及《中标通知书》,甲、乙双方同意签订本合同。详细技术说明及其他有关合同项目的特定信息由合同附件予以说明,合同附件及本项目的招标文件、投标文件、《中标通知书》等均为本合同不可分割的部分。双方同意共同遵守如下条款:
|
||||
</p>
|
||||
<p><a id="_Toc308164852"></a><a id="_Toc217446107"></a>一、合同货物</p>
|
||||
<table>
|
||||
<tr>
|
||||
<td rowspan="2"><p>货物</p>
|
||||
<p>品名</p></td>
|
||||
<td rowspan="2"><p>规格</p>
|
||||
<p>型号</p></td>
|
||||
<td rowspan="2"><p>单位</p></td>
|
||||
<td rowspan="2"><p>数量</p></td>
|
||||
<td rowspan="2"><p>单价</p>
|
||||
<p>(万元)</p></td>
|
||||
<td rowspan="2"><p>总价(万元)</p></td>
|
||||
<td colspan="3"><p>资金来源(万元)</p></td>
|
||||
<td rowspan="2"><p>随机</p>
|
||||
<p>配件</p></td>
|
||||
<td rowspan="2"><p>交货期</p></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><p>预算内</p></td>
|
||||
<td><p>预算外</p></td>
|
||||
<td><p>其他</p></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><p> </p></td>
|
||||
<td><p> </p></td>
|
||||
<td><p> </p></td>
|
||||
<td><p> </p></td>
|
||||
<td><p> </p></td>
|
||||
<td><p> </p></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
<td><p> </p></td>
|
||||
<td></td>
|
||||
<td><p> </p></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><p> </p></td>
|
||||
<td><p> </p></td>
|
||||
<td><p> </p></td>
|
||||
<td><p> </p></td>
|
||||
<td><p> </p></td>
|
||||
<td><p> </p></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
<td><p> </p></td>
|
||||
<td></td>
|
||||
<td><p> </p></td>
|
||||
</tr>
|
||||
</table>
|
||||
<p><a id="_Toc308164853"></a><a id="_Toc217446108"></a>二、合同总价</p>
|
||||
<p>合同总价为人民币大写: 元,即RMB¥
|
||||
元;该合同总价已包括货物设计、材料、制造、包装、运输、安装、调试、检测、验收合格交付使用之前及保修期内保修服务与备用物件等等所有其他有关各项的含税费用。本合同执行期间合同总价不变,甲方无须另向乙方支付本合同规定之外的其他任何费用。</p>
|
||||
<p><a id="_Toc308164854"></a><a id="_Toc217446109"></a>三、质量要求</p>
|
||||
<p>1、乙方须提供全新的货物(含零部件、配件等),表面无划伤、无碰撞痕迹,且权属清楚,不得侵害他人的知识产权。</p>
|
||||
<p>2、货物必须符合或优于国家(行业) 标准,以及本项目招标文件的质量要求和技术指标与出厂标准。</p>
|
||||
<p>3、乙方须在本合同签订之日起
|
||||
日内送交货物成品样品给甲方确认,在甲方出具样品确认书并封存成品样品外观尺寸后,乙方才能按样生产,并以此样品作为验收样品;每台货物上均应有产品质量检验合格标志。</p>
|
||||
<p>
|
||||
4、货物制造质量出现问题,乙方应负责三包(包修、包换、包退),费用由乙方负担,甲方有权到乙方生产场地检查货物质量和生产进度。</p>
|
||||
<p>5、货到现场后由于甲方保管不当造成的质量问题,乙方亦应负责修理,但费用由甲方负担。</p>
|
||||
<p><a id="_Toc308164855"></a><a id="_Toc217446110"></a>四、交货及验收</p>
|
||||
<p>1、乙方交货期限为合同签订生效后的 日内,在合同签订生效之日起 天内交货到甲方指定地点,随即在
|
||||
日内全部完成安装调试验收合格交付使用,并且最迟应在 年 月
|
||||
日前全部完成安装调试验收合格交付使用(如由于采购人的原因造成合同延迟签订或验收的,时间顺延)。交货验收时须提供产品质检部门从同类产品中抽样检查合格的检测报告。</p>
|
||||
<p>2、验收由甲方组织,乙方配合进行:</p>
|
||||
<p>(1) 货物在乙方通知安装调试完毕后 日内初步验收。初步验收合格后,进入 试用期;试用期间发生重大质量问题,修复后试用相应顺延;试用期结束后
|
||||
日内完成最终验收;</p>
|
||||
<p>(2)
|
||||
验收标准:按国家有关规定以及甲方招标文件的质量要求和技术指标、乙方的投标文件及承诺与本合同约定标准进行验收;甲乙双方如对质量要求和技术指标的约定标准有相互抵触或异议的事项,由甲方在招标与投标文件中按质量要求和技术指标比较优胜的原则确定该项的约定标准进行验收;</p>
|
||||
<p>(3)
|
||||
验收时如发现所交付的货物有短装、次品、损坏或其它不符合标准及本合同规定之情形者,甲方应做出详尽的现场记录,或由甲乙双方签署备忘录,此现场记录或备忘录可用作补充、缺失和更换损坏部件的有效证据,由此产生的时间延误与有关费用由乙方承担,验收期限相应顺延;</p>
|
||||
<p>(4) 如质量验收合格,双方签署质量验收报告。</p>
|
||||
<p>3、货物安装完成后 日内,甲方无故不进行验收工作并已使用货物的,视同已安装调试完成并验收合格。</p>
|
||||
<p>4、乙方不能完整交付货物必须负责补齐,否则视为未按合同约定交货。</p>
|
||||
<p>5、如货物经乙方
|
||||
次维修仍不能达到合同约定的质量标准,甲方有权退货,并视作乙方不能交付货物而须支付违约赔偿金给甲方,甲方还可依法追究乙方的违约责任。 </p>
|
||||
<p><a id="_Toc308164856"></a><a id="_Toc217446111"></a>五、付款方式</p>
|
||||
<p>1、甲方在本合同签订生效之日起计算款额¥ 元,人民币大写: 元整)后的 日内支付合同金额百分之
|
||||
的价款(根据磋商文件要求);</p>
|
||||
<p>2、全部货物安装调试完毕并验收合格之日起,甲方接到乙方通知与票据凭证资料以后的 日内,向乙方核拨合同总价的百分之 款项:¥
|
||||
元,人民币大写 元整;</p>
|
||||
<p>3、合同履约保证金:在货物验收合格满 后,甲方财务部门接到乙方通知和支付凭证资料文件,以及由甲方确认本合同货物质量与服务等约定事项已经履行完毕的正式书面文件后的
|
||||
日内,递交结算凭证资料给银行并由其向乙方支付价款¥ 元, 人民币大写: 元整(根据招标文件要求);</p>
|
||||
<p>4、乙方须向甲方出具合法有效完整的完税发票及凭证资料进行支付结算。</p>
|
||||
<p><a id="_Toc217446112"></a><a id="_Toc308164857"></a>六、售后服务</p>
|
||||
<p>1、质保期为验收合格后 年,质保期内出现质量问题,乙方在接到通知后 小时内响应到场, 小时内完成维修或更换,并承担修理调换的费用;如货物经乙方
|
||||
次维修仍不能达到本合同约定的质量标准,视作乙方未能按时交货,甲方有权退货并追究乙方的违约责任。货到现场后由于甲方保管不当造成的问题,乙方亦应负责修复,但费用由甲方负担。</p>
|
||||
<p>2、乙方须指派专人负责与甲方联系售后服务事宜。 </p>
|
||||
<p><a id="_Toc217446113"></a><a id="_Toc308164858"></a>七、违约责任</p>
|
||||
<p>1、甲方违约责任</p>
|
||||
<p>(1) 甲方无正当理由拒收货物的,甲方应偿付合同总价百分之 的违约金;</p>
|
||||
<p>(2) 甲方逾期支付货款的,除应及时付足货款外,应向乙方偿付欠款总额万分之 /天的违约金;逾期付款超过
|
||||
天的,乙方有权终止合同;</p>
|
||||
<p>(3) 甲方偿付的违约金不足以弥补乙方损失的,还应按乙方损失尚未弥补的部分,支付赔偿金给乙方。</p>
|
||||
<p>2、乙方违约责任</p>
|
||||
<p>(1)乙方交付的货物质量不符合合同规定的,乙方应向甲方支付合同总价的百分之
|
||||
的违约金,并须在合同规定的交货时间内更换合格的货物给甲方,否则,视作乙方不能交付货物而违约,按本条本款下述第“(2)”项规定由乙方偿付违约赔偿金给甲方。</p>
|
||||
<p>(2)乙方不能交付货物或逾期交付货物而违约的,除应及时交足货物外,应向甲方偿付逾期交货部分货款总额的万分之
|
||||
/天的违约金;逾期交货超过 天,甲方有权终止合同,乙方则应按合同总价的百分之
|
||||
的款额向甲方偿付赔偿金,并须全额退还甲方已经付给乙方的货款及其利息。</p>
|
||||
<p>(3)乙方货物经甲方送交具有法定资格条件的质量技术监督机构检测后,如检测结果认定货物质量不符合本合同规定标准的,则视为乙方没有按时交货而违约,乙方须在
|
||||
天内无条件更换合格的货物,如逾期不能更换合格的货物,甲方有权终止本合同,乙方应另付合同总价的百分之
|
||||
的赔偿金给甲方。</p>
|
||||
<p>(4)乙方保证本合同货物的权利无瑕疵,包括货物所有权及知识产权等权利无瑕疵。如任何第三方经法院(或仲裁机构)裁决有权对上述货物主张权利或国家机关依法对货物进行没收查处的,乙方除应向甲方返还已收款项外,还应另按合同总价的百分之
|
||||
向甲方支付违约金并赔偿因此给甲方造成的一切损失。</p>
|
||||
<p>(5)乙方偿付的违约金不足以弥补甲方损失的,还应按甲方损失尚未弥补的部分,支付赔偿金给甲方。</p>
|
||||
<p><a id="_Toc308164859"></a><a id="_Toc217446114"></a>八、争议解决办法</p>
|
||||
<p>
|
||||
1、因货物的质量问题发生争议,由质量技术监督部门或其指定的质量鉴定机构进行质量鉴定。货物符合标准的,鉴定费由甲方承担;货物不符合质量标准的,鉴定费由乙方承担。</p>
|
||||
<p>2、合同履行期间,若双方发生争议,可协商或由有关部门调解解决,协商或调解不成的,由当事人依法维护其合法权益。</p>
|
||||
<p><a id="_Toc308164860"></a>九、其他</p>
|
||||
<p>1、如有未尽事宜,由双方依法订立补充合同。</p>
|
||||
<p>2、本合同双方应加盖骑缝章。</p>
|
||||
<p>3、本合同一式四份,自双方签章并经省政府采购中心审核编号后生效。甲方、乙方、政府采购管理部门、 省政府采购中心各一份。</p>
|
||||
<p>甲方: (盖章) 乙方: (盖章)</p>
|
||||
<p>法定代表人(授权代表): 法定代表人(授权代表):</p>
|
||||
<p>地 址: 地 址:</p>
|
||||
<p>开户银行: 开户银行:</p>
|
||||
<p>账号: 账号:</p>
|
||||
<p>电 话: 电 话:</p>
|
||||
<p>传 真: 传 真:</p>
|
||||
<p>签约日期:年 月 日 签约日期: 年 月 </p>
|
||||
<p><img alt="185310636-6ce02f7c-790d-479f-b163-ea97a5a04808-20240708082238739"
|
||||
src="output/test/185310636-6ce02f7c-790d-479f-b163-ea97a5a04808-20240708082238739.jpg"/></p>
|
||||
<p>发票 1</p>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
127
mineru/model/docx/test.md
Normal file
127
mineru/model/docx/test.md
Normal file
@@ -0,0 +1,127 @@
|
||||
**采购合同**
|
||||
|
||||
合同编号:
|
||||
|
||||
签订地点:
|
||||
|
||||
签订时间:
|
||||
|
||||
采购人(甲方):
|
||||
|
||||
供应商(乙方):
|
||||
|
||||
根据《中华人民共和国政府采购法》、《中华人民共和国合同法》及XX政府采购中心 采购项目(项目编号:XX)的《招标文件》、乙方的《投标文件》及《中标通知书》,甲、乙双方同意签订本合同。详细技术说明及其他有关合同项目的特定信息由合同附件予以说明,合同附件及本项目的招标文件、投标文件、《中标通知书》等均为本合同不可分割的部分。双方同意共同遵守如下条款:
|
||||
|
||||
一、合同货物
|
||||
|
||||
<table><tr><td rowspan="2"><p>货物</p><p>品名</p></td><td rowspan="2"><p>规格</p><p>型号</p></td><td rowspan="2"><p>单位</p></td><td rowspan="2"><p>数量</p></td><td rowspan="2"><p>单价</p><p>(万元)</p></td><td rowspan="2"><p>总价(万元)</p></td><td colspan="3"><p>资金来源(万元)</p></td><td rowspan="2"><p>随机</p><p>配件</p></td><td rowspan="2"><p>交货期</p></td></tr><tr><td><p>预算内</p></td><td><p>预算外</p></td><td><p>其他</p></td></tr><tr><td><p> </p></td><td><p> </p></td><td><p> </p></td><td><p> </p></td><td><p> </p></td><td><p> </p></td><td></td><td></td><td><p> </p></td><td></td><td><p> </p></td></tr><tr><td><p> </p></td><td><p> </p></td><td><p> </p></td><td><p> </p></td><td><p> </p></td><td><p> </p></td><td></td><td></td><td><p> </p></td><td></td><td><p> </p></td></tr></table>
|
||||
|
||||
二、合同总价
|
||||
|
||||
合同总价为人民币大写: 元,即RMB¥ 元;该合同总价已包括货物设计、材料、制造、包装、运输、安装、调试、检测、验收合格交付使用之前及保修期内保修服务与备用物件等等所有其他有关各项的含税费用。本合同执行期间合同总价不变,甲方无须另向乙方支付本合同规定之外的其他任何费用。
|
||||
|
||||
三、质量要求
|
||||
|
||||
1、乙方须提供全新的货物(含零部件、配件等),表面无划伤、无碰撞痕迹,且权属清楚,不得侵害他人的知识产权。
|
||||
|
||||
2、货物必须符合或优于国家(行业) 标准,以及本项目招标文件的质量要求和技术指标与出厂标准。
|
||||
|
||||
3、乙方须在本合同签订之日起 日内送交货物成品样品给甲方确认,在甲方出具样品确认书并封存成品样品外观尺寸后,乙方才能按样生产,并以此样品作为验收样品;每台货物上均应有产品质量检验合格标志。
|
||||
|
||||
4、货物制造质量出现问题,乙方应负责三包(包修、包换、包退),费用由乙方负担,甲方有权到乙方生产场地检查货物质量和生产进度。
|
||||
|
||||
5、货到现场后由于甲方保管不当造成的质量问题,乙方亦应负责修理,但费用由甲方负担。
|
||||
|
||||
四、交货及验收
|
||||
|
||||
1、乙方交货期限为合同签订生效后的 日内,在合同签订生效之日起 天内交货到甲方指定地点,随即在 日内全部完成安装调试验收合格交付使用,并且最迟应在 年 月 日前全部完成安装调试验收合格交付使用(如由于采购人的原因造成合同延迟签订或验收的,时间顺延)。交货验收时须提供产品质检部门从同类产品中抽样检查合格的检测报告。
|
||||
|
||||
2、验收由甲方组织,乙方配合进行:
|
||||
|
||||
(1) 货物在乙方通知安装调试完毕后 日内初步验收。初步验收合格后,进入 试用期;试用期间发生重大质量问题,修复后试用相应顺延;试用期结束后 日内完成最终验收;
|
||||
|
||||
(2) 验收标准:按国家有关规定以及甲方招标文件的质量要求和技术指标、乙方的投标文件及承诺与本合同约定标准进行验收;甲乙双方如对质量要求和技术指标的约定标准有相互抵触或异议的事项,由甲方在招标与投标文件中按质量要求和技术指标比较优胜的原则确定该项的约定标准进行验收;
|
||||
|
||||
(3) 验收时如发现所交付的货物有短装、次品、损坏或其它不符合标准及本合同规定之情形者,甲方应做出详尽的现场记录,或由甲乙双方签署备忘录,此现场记录或备忘录可用作补充、缺失和更换损坏部件的有效证据,由此产生的时间延误与有关费用由乙方承担,验收期限相应顺延;
|
||||
|
||||
(4) 如质量验收合格,双方签署质量验收报告。
|
||||
|
||||
3、货物安装完成后 日内,甲方无故不进行验收工作并已使用货物的,视同已安装调试完成并验收合格。
|
||||
|
||||
4、乙方不能完整交付货物必须负责补齐,否则视为未按合同约定交货。
|
||||
|
||||
5、如货物经乙方 次维修仍不能达到合同约定的质量标准,甲方有权退货,并视作乙方不能交付货物而须支付违约赔偿金给甲方,甲方还可依法追究乙方的违约责任。
|
||||
|
||||
五、付款方式
|
||||
|
||||
1、甲方在本合同签订生效之日起计算款额¥ 元,人民币大写: 元整)后的 日内支付合同金额百分之 的价款(根据磋商文件要求);
|
||||
|
||||
2、全部货物安装调试完毕并验收合格之日起,甲方接到乙方通知与票据凭证资料以后的 日内,向乙方核拨合同总价的百分之 款项:¥ 元,人民币大写 元整;
|
||||
|
||||
3、合同履约保证金:在货物验收合格满 后,甲方财务部门接到乙方通知和支付凭证资料文件,以及由甲方确认本合同货物质量与服务等约定事项已经履行完毕的正式书面文件后的 日内,递交结算凭证资料给银行并由其向乙方支付价款¥ 元, 人民币大写: 元整(根据招标文件要求);
|
||||
|
||||
4、乙方须向甲方出具合法有效完整的完税发票及凭证资料进行支付结算。
|
||||
|
||||
六、售后服务
|
||||
|
||||
1、质保期为验收合格后 年,质保期内出现质量问题,乙方在接到通知后 小时内响应到场, 小时内完成维修或更换,并承担修理调换的费用;如货物经乙方 次维修仍不能达到本合同约定的质量标准,视作乙方未能按时交货,甲方有权退货并追究乙方的违约责任。货到现场后由于甲方保管不当造成的问题,乙方亦应负责修复,但费用由甲方负担。
|
||||
|
||||
2、乙方须指派专人负责与甲方联系售后服务事宜。
|
||||
|
||||
七、违约责任
|
||||
|
||||
1、甲方违约责任
|
||||
|
||||
(1) 甲方无正当理由拒收货物的,甲方应偿付合同总价百分之 的违约金;
|
||||
|
||||
(2) 甲方逾期支付货款的,除应及时付足货款外,应向乙方偿付欠款总额万分之 /天的违约金;逾期付款超过 天的,乙方有权终止合同;
|
||||
|
||||
(3) 甲方偿付的违约金不足以弥补乙方损失的,还应按乙方损失尚未弥补的部分,支付赔偿金给乙方。
|
||||
|
||||
2、乙方违约责任
|
||||
|
||||
(1)乙方交付的货物质量不符合合同规定的,乙方应向甲方支付合同总价的百分之 的违约金,并须在合同规定的交货时间内更换合格的货物给甲方,否则,视作乙方不能交付货物而违约,按本条本款下述第“(2)”项规定由乙方偿付违约赔偿金给甲方。
|
||||
|
||||
(2)乙方不能交付货物或逾期交付货物而违约的,除应及时交足货物外,应向甲方偿付逾期交货部分货款总额的万分之 /天的违约金;逾期交货超过 天,甲方有权终止合同,乙方则应按合同总价的百分之 的款额向甲方偿付赔偿金,并须全额退还甲方已经付给乙方的货款及其利息。
|
||||
|
||||
(3)乙方货物经甲方送交具有法定资格条件的质量技术监督机构检测后,如检测结果认定货物质量不符合本合同规定标准的,则视为乙方没有按时交货而违约,乙方须在 天内无条件更换合格的货物,如逾期不能更换合格的货物,甲方有权终止本合同,乙方应另付合同总价的百分之 的赔偿金给甲方。
|
||||
|
||||
(4)乙方保证本合同货物的权利无瑕疵,包括货物所有权及知识产权等权利无瑕疵。如任何第三方经法院(或仲裁机构)裁决有权对上述货物主张权利或国家机关依法对货物进行没收查处的,乙方除应向甲方返还已收款项外,还应另按合同总价的百分之 向甲方支付违约金并赔偿因此给甲方造成的一切损失。
|
||||
|
||||
(5)乙方偿付的违约金不足以弥补甲方损失的,还应按甲方损失尚未弥补的部分,支付赔偿金给甲方。
|
||||
|
||||
八、争议解决办法
|
||||
|
||||
1、因货物的质量问题发生争议,由质量技术监督部门或其指定的质量鉴定机构进行质量鉴定。货物符合标准的,鉴定费由甲方承担;货物不符合质量标准的,鉴定费由乙方承担。
|
||||
|
||||
2、合同履行期间,若双方发生争议,可协商或由有关部门调解解决,协商或调解不成的,由当事人依法维护其合法权益。
|
||||
|
||||
九、其他
|
||||
|
||||
1、如有未尽事宜,由双方依法订立补充合同。
|
||||
|
||||
2、本合同双方应加盖骑缝章。
|
||||
|
||||
3、本合同一式四份,自双方签章并经省政府采购中心审核编号后生效。甲方、乙方、政府采购管理部门、 省政府采购中心各一份。
|
||||
|
||||
甲方: (盖章) 乙方: (盖章)
|
||||
|
||||
法定代表人(授权代表): 法定代表人(授权代表):
|
||||
|
||||
地 址: 地 址:
|
||||
|
||||
开户银行: 开户银行:
|
||||
|
||||
账号: 账号:
|
||||
|
||||
电 话: 电 话:
|
||||
|
||||
传 真: 传 真:
|
||||
|
||||
签约日期:年 月 日 签约日期: 年 月
|
||||
|
||||

|
||||
|
||||
发票 1
|
||||
|
||||
进程已结束,退出代码为 0
|
||||
BIN
mineru/model/docx/test.png
Normal file
BIN
mineru/model/docx/test.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 1.3 MiB |
2
mineru/model/docx/test2/[Content_Types].xml
Normal file
2
mineru/model/docx/test2/[Content_Types].xml
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"><Default Extension="jpeg" ContentType="image/jpeg"/><Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/><Default Extension="xml" ContentType="application/xml"/><Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/><Override PartName="/customXml/itemProps1.xml" ContentType="application/vnd.openxmlformats-officedocument.customXmlProperties+xml"/><Override PartName="/word/numbering.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml"/><Override PartName="/word/styles.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml"/><Override PartName="/word/settings.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml"/><Override PartName="/word/webSettings.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.webSettings+xml"/><Override PartName="/word/footnotes.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml"/><Override PartName="/word/endnotes.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml"/><Override PartName="/word/header1.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml"/><Override PartName="/word/header2.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml"/><Override PartName="/word/footer1.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml"/><Override PartName="/word/footer2.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml"/><Override PartName="/word/header3.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml"/><Override 
PartName="/word/footer3.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml"/><Override PartName="/word/fontTable.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml"/><Override PartName="/word/glossary/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.glossary+xml"/><Override PartName="/word/glossary/styles.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml"/><Override PartName="/word/glossary/settings.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml"/><Override PartName="/word/glossary/webSettings.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.webSettings+xml"/><Override PartName="/word/glossary/fontTable.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml"/><Override PartName="/word/theme/theme1.xml" ContentType="application/vnd.openxmlformats-officedocument.theme+xml"/><Override PartName="/docProps/core.xml" ContentType="application/vnd.openxmlformats-package.core-properties+xml"/><Override PartName="/docProps/app.xml" ContentType="application/vnd.openxmlformats-officedocument.extended-properties+xml"/><Override PartName="/docProps/custom.xml" ContentType="application/vnd.openxmlformats-officedocument.custom-properties+xml"/></Types>
|
||||
2
mineru/model/docx/test2/_rels/.rels
Normal file
2
mineru/model/docx/test2/_rels/.rels
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties" Target="docProps/app.xml"/><Relationship Id="rId2" Type="http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties" Target="docProps/core.xml"/><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/><Relationship Id="rId4" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/custom-properties" Target="docProps/custom.xml"/></Relationships>
|
||||
2
mineru/model/docx/test2/customXml/_rels/item1.xml.rels
Normal file
2
mineru/model/docx/test2/customXml/_rels/item1.xml.rels
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/customXmlProps" Target="itemProps1.xml"/></Relationships>
|
||||
1
mineru/model/docx/test2/customXml/item1.xml
Normal file
1
mineru/model/docx/test2/customXml/item1.xml
Normal file
@@ -0,0 +1 @@
|
||||
<?xml version="1.0" standalone="no"?><b:Sources xmlns:b="http://schemas.openxmlformats.org/officeDocument/2006/bibliography" xmlns="http://schemas.openxmlformats.org/officeDocument/2006/bibliography" SelectedStyle="\APASixthEditionOfficeOnline.xsl" StyleName="APA" Version="6"></b:Sources>
|
||||
2
mineru/model/docx/test2/customXml/itemProps1.xml
Normal file
2
mineru/model/docx/test2/customXml/itemProps1.xml
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<ds:datastoreItem ds:itemID="{247BE1E9-2D8F-4D80-862D-FA139EE27171}" xmlns:ds="http://schemas.openxmlformats.org/officeDocument/2006/customXml"><ds:schemaRefs><ds:schemaRef ds:uri="http://schemas.openxmlformats.org/officeDocument/2006/bibliography"/></ds:schemaRefs></ds:datastoreItem>
|
||||
2
mineru/model/docx/test2/docProps/app.xml
Normal file
2
mineru/model/docx/test2/docProps/app.xml
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/extended-properties" xmlns:vt="http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes"><Template>{2447CD0F-22A7-4A1B-A9BC-34FF8F943A34}TFf1603197-d3d8-44fc-95f7-0445aa29d9afca513b70_win32-e2ceec99d124.dotx</Template><TotalTime>6</TotalTime><Pages>2</Pages><Words>82</Words><Characters>468</Characters><Application>Microsoft Office Word</Application><DocSecurity>0</DocSecurity><Lines>3</Lines><Paragraphs>1</Paragraphs><ScaleCrop>false</ScaleCrop><HeadingPairs><vt:vector size="2" baseType="variant"><vt:variant><vt:lpstr>Title</vt:lpstr></vt:variant><vt:variant><vt:i4>1</vt:i4></vt:variant></vt:vector></HeadingPairs><TitlesOfParts><vt:vector size="1" baseType="lpstr"><vt:lpstr></vt:lpstr></vt:vector></TitlesOfParts><Company></Company><LinksUpToDate>false</LinksUpToDate><CharactersWithSpaces>549</CharactersWithSpaces><SharedDoc>false</SharedDoc><HyperlinksChanged>false</HyperlinksChanged><AppVersion>16.0000</AppVersion></Properties>
|
||||
2
mineru/model/docx/test2/docProps/core.xml
Normal file
2
mineru/model/docx/test2/docProps/core.xml
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:dcmitype="http://purl.org/dc/dcmitype/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><dc:title></dc:title><dc:subject></dc:subject><dc:creator>Sidney Chen</dc:creator><cp:keywords></cp:keywords><dc:description></dc:description><cp:lastModifiedBy>Sidney Chen</cp:lastModifiedBy><cp:revision>1</cp:revision><dcterms:created xsi:type="dcterms:W3CDTF">2025-11-04T08:12:00Z</dcterms:created><dcterms:modified xsi:type="dcterms:W3CDTF">2025-11-04T08:18:00Z</dcterms:modified></cp:coreProperties>
|
||||
2
mineru/model/docx/test2/docProps/custom.xml
Normal file
2
mineru/model/docx/test2/docProps/custom.xml
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/custom-properties" xmlns:vt="http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes"><property fmtid="{D5CDD505-2E9C-101B-9397-08002B2CF9AE}" pid="2" name="ContentTypeId"><vt:lpwstr>0x010100AA3F7D94069FF64A86F7DFF56D60E3BE</vt:lpwstr></property></Properties>
|
||||
2
mineru/model/docx/test2/word/_rels/document.xml.rels
Normal file
2
mineru/model/docx/test2/word/_rels/document.xml.rels
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId8" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="media/image1.jpeg"/><Relationship Id="rId13" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/header" Target="header3.xml"/><Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles" Target="styles.xml"/><Relationship Id="rId7" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes" Target="endnotes.xml"/><Relationship Id="rId12" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/footer" Target="footer2.xml"/><Relationship Id="rId17" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme" Target="theme/theme1.xml"/><Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering" Target="numbering.xml"/><Relationship Id="rId16" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/glossaryDocument" Target="glossary/document.xml"/><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/customXml" Target="../customXml/item1.xml"/><Relationship Id="rId6" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/footnotes" Target="footnotes.xml"/><Relationship Id="rId11" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/footer" Target="footer1.xml"/><Relationship Id="rId5" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/webSettings" Target="webSettings.xml"/><Relationship Id="rId15" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable" Target="fontTable.xml"/><Relationship Id="rId10" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/header" Target="header2.xml"/><Relationship Id="rId4" 
Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings" Target="settings.xml"/><Relationship Id="rId9" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/header" Target="header1.xml"/><Relationship Id="rId14" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/footer" Target="footer3.xml"/></Relationships>
|
||||
2
mineru/model/docx/test2/word/_rels/settings.xml.rels
Normal file
2
mineru/model/docx/test2/word/_rels/settings.xml.rels
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/attachedTemplate" Target="file:///C:\Users\sidney\AppData\Local\Microsoft\Office\16.0\DTS\zh-CN%7bFCF86E55-83C6-41EC-B02E-A4F386444BB8%7d\%7b2447CD0F-22A7-4A1B-A9BC-34FF8F943A34%7dTFf1603197-d3d8-44fc-95f7-0445aa29d9afca513b70_win32-e2ceec99d124.dotx" TargetMode="External"/></Relationships>
|
||||
2996
mineru/model/docx/test2/word/document.xml
Normal file
2996
mineru/model/docx/test2/word/document.xml
Normal file
File diff suppressed because it is too large
Load Diff
2
mineru/model/docx/test2/word/endnotes.xml
Normal file
2
mineru/model/docx/test2/word/endnotes.xml
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:endnotes xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" 
xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du wp14"><w:endnote w:type="separator" w:id="-1"><w:p w14:paraId="066749CB" w14:textId="77777777" w:rsidR="00DE3E16" w:rsidRDefault="00DE3E16"><w:pPr><w:spacing w:line="240" w:lineRule="auto"/><w:rPr><w:rFonts w:hint="eastAsia"/></w:rPr></w:pPr><w:r><w:separator/></w:r></w:p></w:endnote><w:endnote w:type="continuationSeparator" w:id="0"><w:p w14:paraId="1D7CA5B8" w14:textId="77777777" w:rsidR="00DE3E16" w:rsidRDefault="00DE3E16"><w:pPr><w:spacing w:line="240" w:lineRule="auto"/><w:rPr><w:rFonts w:hint="eastAsia"/></w:rPr></w:pPr><w:r><w:continuationSeparator/></w:r></w:p></w:endnote></w:endnotes>
|
||||
2
mineru/model/docx/test2/word/fontTable.xml
Normal file
2
mineru/model/docx/test2/word/fontTable.xml
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:fonts xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du"><w:font w:name="Symbol"><w:panose1 w:val="05050102010706020507"/><w:charset w:val="02"/><w:family w:val="roman"/><w:pitch w:val="variable"/><w:sig w:usb0="00000000" w:usb1="10000000" w:usb2="00000000" w:usb3="00000000" w:csb0="80000000" w:csb1="00000000"/></w:font><w:font w:name="Times New Roman"><w:panose1 w:val="02020603050405020304"/><w:charset w:val="00"/><w:family w:val="roman"/><w:pitch w:val="variable"/><w:sig w:usb0="E0002EFF" w:usb1="C000785B" w:usb2="00000009" w:usb3="00000000" w:csb0="000001FF" w:csb1="00000000"/></w:font><w:font w:name="Arial"><w:panose1 w:val="020B0604020202020204"/><w:charset w:val="00"/><w:family w:val="swiss"/><w:pitch w:val="variable"/><w:sig w:usb0="E0002EFF" w:usb1="C000785B" w:usb2="00000009" w:usb3="00000000" w:csb0="000001FF" w:csb1="00000000"/></w:font><w:font w:name="MS PGothic"><w:panose1 w:val="020B0600070205080204"/><w:charset w:val="80"/><w:family w:val="swiss"/><w:pitch w:val="variable"/><w:sig w:usb0="E00002FF" w:usb1="6AC7FDFB" w:usb2="08000012" w:usb3="00000000" w:csb0="0002009F" 
w:csb1="00000000"/></w:font><w:font w:name="Microsoft YaHei UI"><w:panose1 w:val="020B0503020204020204"/><w:charset w:val="86"/><w:family w:val="swiss"/><w:pitch w:val="variable"/><w:sig w:usb0="80000287" w:usb1="2ACF3C50" w:usb2="00000016" w:usb3="00000000" w:csb0="0004001F" w:csb1="00000000"/></w:font><w:font w:name="黑体"><w:altName w:val="SimHei"/><w:panose1 w:val="02010609060101010101"/><w:charset w:val="86"/><w:family w:val="modern"/><w:pitch w:val="fixed"/><w:sig w:usb0="800002BF" w:usb1="38CF7CFA" w:usb2="00000016" w:usb3="00000000" w:csb0="00040001" w:csb1="00000000"/></w:font></w:fonts>
|
||||
2
mineru/model/docx/test2/word/footer1.xml
Normal file
2
mineru/model/docx/test2/word/footer1.xml
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:ftr xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" 
xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du wp14"><w:p w14:paraId="71759E2F" w14:textId="77777777" w:rsidR="00BB4862" w:rsidRDefault="00BB4862"><w:pPr><w:pStyle w:val="af0"/><w:rPr><w:rFonts w:hint="eastAsia"/></w:rPr></w:pPr></w:p></w:ftr>
|
||||
2
mineru/model/docx/test2/word/footer2.xml
Normal file
2
mineru/model/docx/test2/word/footer2.xml
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:ftr xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" 
xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du wp14"><w:sdt><w:sdtPr><w:id w:val="107940834"/><w:docPartObj><w:docPartGallery w:val="Page Numbers (Bottom of Page)"/><w:docPartUnique/></w:docPartObj></w:sdtPr><w:sdtEndPr><w:rPr><w:noProof/></w:rPr></w:sdtEndPr><w:sdtContent><w:p w14:paraId="34CD4779" w14:textId="77777777" w:rsidR="008049DB" w:rsidRDefault="006A3739"><w:pPr><w:pStyle w:val="af0"/><w:rPr><w:rFonts w:hint="eastAsia"/></w:rPr></w:pPr><w:r><w:rPr><w:lang w:val="zh-CN" w:bidi="zh-CN"/></w:rPr><w:fldChar w:fldCharType="begin"/></w:r><w:r><w:rPr><w:lang w:val="zh-CN" w:bidi="zh-CN"/></w:rPr><w:instrText xml:space="preserve"> PAGE \* MERGEFORMAT </w:instrText></w:r><w:r><w:rPr><w:lang w:val="zh-CN" w:bidi="zh-CN"/></w:rPr><w:fldChar w:fldCharType="separate"/></w:r><w:r w:rsidR="00793AFB"><w:rPr><w:noProof/><w:lang w:val="zh-CN" w:bidi="zh-CN"/></w:rPr><w:t>2</w:t></w:r><w:r><w:rPr><w:noProof/><w:lang w:val="zh-CN" w:bidi="zh-CN"/></w:rPr><w:fldChar w:fldCharType="end"/></w:r></w:p></w:sdtContent></w:sdt></w:ftr>
|
||||
2
mineru/model/docx/test2/word/footer3.xml
Normal file
2
mineru/model/docx/test2/word/footer3.xml
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:ftr xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" 
xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du wp14"><w:p w14:paraId="06BA2A24" w14:textId="77777777" w:rsidR="00BB4862" w:rsidRDefault="00BB4862"><w:pPr><w:pStyle w:val="af0"/><w:rPr><w:rFonts w:hint="eastAsia"/></w:rPr></w:pPr></w:p></w:ftr>
|
||||
2
mineru/model/docx/test2/word/footnotes.xml
Normal file
2
mineru/model/docx/test2/word/footnotes.xml
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:footnotes xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" 
xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du wp14"><w:footnote w:type="separator" w:id="-1"><w:p w14:paraId="0D637794" w14:textId="77777777" w:rsidR="00DE3E16" w:rsidRDefault="00DE3E16"><w:pPr><w:spacing w:line="240" w:lineRule="auto"/><w:rPr><w:rFonts w:hint="eastAsia"/></w:rPr></w:pPr><w:r><w:separator/></w:r></w:p></w:footnote><w:footnote w:type="continuationSeparator" w:id="0"><w:p w14:paraId="5D637BFB" w14:textId="77777777" w:rsidR="00DE3E16" w:rsidRDefault="00DE3E16"><w:pPr><w:spacing w:line="240" w:lineRule="auto"/><w:rPr><w:rFonts w:hint="eastAsia"/></w:rPr></w:pPr><w:r><w:continuationSeparator/></w:r></w:p></w:footnote></w:footnotes>
|
||||
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/webSettings" Target="webSettings.xml"/><Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings" Target="settings.xml"/><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles" Target="styles.xml"/><Relationship Id="rId4" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable" Target="fontTable.xml"/></Relationships>
|
||||
2
mineru/model/docx/test2/word/glossary/document.xml
Normal file
2
mineru/model/docx/test2/word/glossary/document.xml
Normal file
File diff suppressed because one or more lines are too long
2
mineru/model/docx/test2/word/glossary/fontTable.xml
Normal file
2
mineru/model/docx/test2/word/glossary/fontTable.xml
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:fonts xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du"><w:font w:name="Symbol"><w:panose1 w:val="05050102010706020507"/><w:charset w:val="02"/><w:family w:val="roman"/><w:pitch w:val="variable"/><w:sig w:usb0="00000000" w:usb1="10000000" w:usb2="00000000" w:usb3="00000000" w:csb0="80000000" w:csb1="00000000"/></w:font><w:font w:name="Times New Roman"><w:panose1 w:val="02020603050405020304"/><w:charset w:val="00"/><w:family w:val="roman"/><w:pitch w:val="variable"/><w:sig w:usb0="E0002EFF" w:usb1="C000785B" w:usb2="00000009" w:usb3="00000000" w:csb0="000001FF" w:csb1="00000000"/></w:font><w:font w:name="Arial"><w:panose1 w:val="020B0604020202020204"/><w:charset w:val="00"/><w:family w:val="swiss"/><w:pitch w:val="variable"/><w:sig w:usb0="E0002EFF" w:usb1="C000785B" w:usb2="00000009" w:usb3="00000000" w:csb0="000001FF" w:csb1="00000000"/></w:font><w:font w:name="MS PGothic"><w:panose1 w:val="020B0600070205080204"/><w:charset w:val="80"/><w:family w:val="swiss"/><w:pitch w:val="variable"/><w:sig w:usb0="E00002FF" w:usb1="6AC7FDFB" w:usb2="08000012" w:usb3="00000000" w:csb0="0002009F" 
w:csb1="00000000"/></w:font><w:font w:name="Microsoft YaHei UI"><w:panose1 w:val="020B0503020204020204"/><w:charset w:val="86"/><w:family w:val="swiss"/><w:pitch w:val="variable"/><w:sig w:usb0="80000287" w:usb1="2ACF3C50" w:usb2="00000016" w:usb3="00000000" w:csb0="0004001F" w:csb1="00000000"/></w:font><w:font w:name="黑体"><w:altName w:val="SimHei"/><w:panose1 w:val="02010609060101010101"/><w:charset w:val="86"/><w:family w:val="modern"/><w:pitch w:val="fixed"/><w:sig w:usb0="800002BF" w:usb1="38CF7CFA" w:usb2="00000016" w:usb3="00000000" w:csb0="00040001" w:csb1="00000000"/></w:font><w:font w:name="等线"><w:altName w:val="DengXian"/><w:panose1 w:val="02010600030101010101"/><w:charset w:val="86"/><w:family w:val="auto"/><w:pitch w:val="variable"/><w:sig w:usb0="A00002BF" w:usb1="38CF7CFA" w:usb2="00000016" w:usb3="00000000" w:csb0="0004000F" w:csb1="00000000"/></w:font><w:font w:name="等线 Light"><w:panose1 w:val="02010600030101010101"/><w:charset w:val="86"/><w:family w:val="auto"/><w:pitch w:val="variable"/><w:sig w:usb0="A00002BF" w:usb1="38CF7CFA" w:usb2="00000016" w:usb3="00000000" w:csb0="0004000F" w:csb1="00000000"/></w:font></w:fonts>
|
||||
2
mineru/model/docx/test2/word/glossary/settings.xml
Normal file
2
mineru/model/docx/test2/word/glossary/settings.xml
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:settings xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:sl="http://schemas.openxmlformats.org/schemaLibrary/2006/main" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du"><w:view w:val="normal"/><w:bordersDoNotSurroundHeader/><w:bordersDoNotSurroundFooter/><w:defaultTabStop w:val="420"/><w:drawingGridVerticalSpacing w:val="156"/><w:displayHorizontalDrawingGridEvery w:val="0"/><w:displayVerticalDrawingGridEvery w:val="2"/><w:characterSpacingControl w:val="compressPunctuation"/><w:compat><w:spaceForUL/><w:balanceSingleByteDoubleByteWidth/><w:doNotLeaveBackslashAlone/><w:ulTrailSpace/><w:doNotExpandShiftReturn/><w:adjustLineHeightInTable/><w:useFELayout/><w:compatSetting w:name="compatibilityMode" w:uri="http://schemas.microsoft.com/office/word" w:val="15"/><w:compatSetting w:name="overrideTableStyleFontSizeAndJustification" w:uri="http://schemas.microsoft.com/office/word" w:val="1"/><w:compatSetting 
w:name="enableOpenTypeFeatures" w:uri="http://schemas.microsoft.com/office/word" w:val="1"/><w:compatSetting w:name="doNotFlipMirrorIndents" w:uri="http://schemas.microsoft.com/office/word" w:val="1"/><w:compatSetting w:name="differentiateMultirowTableHeaders" w:uri="http://schemas.microsoft.com/office/word" w:val="1"/><w:compatSetting w:name="useWord2013TrackBottomHyphenation" w:uri="http://schemas.microsoft.com/office/word" w:val="0"/></w:compat><w:rsids><w:rsidRoot w:val="007012A4"/><w:rsid w:val="007012A4"/><w:rsid w:val="008049C9"/></w:rsids><m:mathPr><m:mathFont m:val="Cambria Math"/><m:brkBin m:val="before"/><m:brkBinSub m:val="--"/><m:smallFrac m:val="0"/><m:dispDef/><m:lMargin m:val="0"/><m:rMargin m:val="0"/><m:defJc m:val="centerGroup"/><m:wrapIndent m:val="1440"/><m:intLim m:val="subSup"/><m:naryLim m:val="undOvr"/></m:mathPr><w:themeFontLang w:val="en-US" w:eastAsia="zh-CN"/><w:clrSchemeMapping w:bg1="light1" w:t1="dark1" w:bg2="light2" w:t2="dark2" w:accent1="accent1" w:accent2="accent2" w:accent3="accent3" w:accent4="accent4" w:accent5="accent5" w:accent6="accent6" w:hyperlink="hyperlink" w:followedHyperlink="followedHyperlink"/><w:decimalSymbol w:val="."/><w:listSeparator w:val=","/><w15:chartTrackingRefBased/></w:settings>
|
||||
2
mineru/model/docx/test2/word/glossary/styles.xml
Normal file
2
mineru/model/docx/test2/word/glossary/styles.xml
Normal file
File diff suppressed because one or more lines are too long
2
mineru/model/docx/test2/word/glossary/webSettings.xml
Normal file
2
mineru/model/docx/test2/word/glossary/webSettings.xml
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:webSettings xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du"><w:optimizeForBrowser/><w:allowPNG/></w:webSettings>
|
||||
2
mineru/model/docx/test2/word/header1.xml
Normal file
2
mineru/model/docx/test2/word/header1.xml
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:hdr xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" 
xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du wp14"><w:p w14:paraId="1CDCFD2F" w14:textId="77777777" w:rsidR="00BB4862" w:rsidRDefault="00BB4862"><w:pPr><w:pStyle w:val="ae"/><w:rPr><w:rFonts w:hint="eastAsia"/></w:rPr></w:pPr></w:p></w:hdr>
|
||||
2
mineru/model/docx/test2/word/header2.xml
Normal file
2
mineru/model/docx/test2/word/header2.xml
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:hdr xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" 
xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du wp14"><w:p w14:paraId="777F27D5" w14:textId="77777777" w:rsidR="00BB4862" w:rsidRDefault="00BB4862"><w:pPr><w:pStyle w:val="ae"/><w:rPr><w:rFonts w:hint="eastAsia"/></w:rPr></w:pPr></w:p></w:hdr>
|
||||
2
mineru/model/docx/test2/word/header3.xml
Normal file
2
mineru/model/docx/test2/word/header3.xml
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:hdr xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:oel="http://schemas.microsoft.com/office/2019/extlst" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" 
xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du wp14"><w:p w14:paraId="0A1573FC" w14:textId="77777777" w:rsidR="00BB4862" w:rsidRDefault="00BB4862"><w:pPr><w:pStyle w:val="ae"/><w:rPr><w:rFonts w:hint="eastAsia"/></w:rPr></w:pPr></w:p></w:hdr>
|
||||
BIN
mineru/model/docx/test2/word/media/image1.jpeg
Normal file
BIN
mineru/model/docx/test2/word/media/image1.jpeg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 136 KiB |
2
mineru/model/docx/test2/word/numbering.xml
Normal file
2
mineru/model/docx/test2/word/numbering.xml
Normal file
File diff suppressed because one or more lines are too long
2
mineru/model/docx/test2/word/settings.xml
Normal file
2
mineru/model/docx/test2/word/settings.xml
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:settings xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:sl="http://schemas.openxmlformats.org/schemaLibrary/2006/main" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du"><w:zoom w:percent="100"/><w:bordersDoNotSurroundHeader/><w:bordersDoNotSurroundFooter/><w:proofState w:spelling="clean" w:grammar="clean"/><w:attachedTemplate r:id="rId1"/><w:defaultTabStop w:val="720"/><w:characterSpacingControl w:val="doNotCompress"/><w:hdrShapeDefaults><o:shapedefaults v:ext="edit" spidmax="2050"/></w:hdrShapeDefaults><w:footnotePr><w:footnote w:id="-1"/><w:footnote w:id="0"/></w:footnotePr><w:endnotePr><w:endnote w:id="-1"/><w:endnote w:id="0"/></w:endnotePr><w:compat><w:useFELayout/><w:compatSetting w:name="compatibilityMode" w:uri="http://schemas.microsoft.com/office/word" w:val="15"/><w:compatSetting w:name="overrideTableStyleFontSizeAndJustification" w:uri="http://schemas.microsoft.com/office/word" w:val="1"/><w:compatSetting 
w:name="enableOpenTypeFeatures" w:uri="http://schemas.microsoft.com/office/word" w:val="1"/><w:compatSetting w:name="doNotFlipMirrorIndents" w:uri="http://schemas.microsoft.com/office/word" w:val="1"/><w:compatSetting w:name="differentiateMultirowTableHeaders" w:uri="http://schemas.microsoft.com/office/word" w:val="1"/><w:compatSetting w:name="useWord2013TrackBottomHyphenation" w:uri="http://schemas.microsoft.com/office/word" w:val="0"/></w:compat><w:rsids><w:rsidRoot w:val="00365A38"/><w:rsid w:val="00055AF8"/><w:rsid w:val="001365E4"/><w:rsid w:val="002558FA"/><w:rsid w:val="00323F56"/><w:rsid w:val="00365A38"/><w:rsid w:val="003667F4"/><w:rsid w:val="00616194"/><w:rsid w:val="006924B3"/><w:rsid w:val="006A3739"/><w:rsid w:val="007577D4"/><w:rsid w:val="00793AFB"/><w:rsid w:val="007D3668"/><w:rsid w:val="008049C9"/><w:rsid w:val="008049DB"/><w:rsid w:val="00837ECD"/><w:rsid w:val="00897AA4"/><w:rsid w:val="00907574"/><w:rsid w:val="00934F6F"/><w:rsid w:val="00966901"/><w:rsid w:val="00981A82"/><w:rsid w:val="00A93410"/><w:rsid w:val="00B76A92"/><w:rsid w:val="00BB4862"/><w:rsid w:val="00BF2506"/><w:rsid w:val="00C3067E"/><w:rsid w:val="00C5328D"/><w:rsid w:val="00CE7F7E"/><w:rsid w:val="00CF07F2"/><w:rsid w:val="00D934CD"/><w:rsid w:val="00DE3E16"/><w:rsid w:val="00E4324B"/><w:rsid w:val="00E9657B"/><w:rsid w:val="00F011A8"/></w:rsids><m:mathPr><m:mathFont m:val="Cambria Math"/><m:brkBin m:val="before"/><m:brkBinSub m:val="--"/><m:smallFrac m:val="0"/><m:dispDef/><m:lMargin m:val="0"/><m:rMargin m:val="0"/><m:defJc m:val="centerGroup"/><m:wrapIndent m:val="1440"/><m:intLim m:val="subSup"/><m:naryLim m:val="undOvr"/></m:mathPr><w:themeFontLang w:val="en-US" w:eastAsia="ja-JP" w:bidi="ar-SA"/><w:clrSchemeMapping w:bg1="light1" w:t1="dark1" w:bg2="light2" w:t2="dark2" w:accent1="accent1" w:accent2="accent2" w:accent3="accent3" w:accent4="accent4" w:accent5="accent5" w:accent6="accent6" w:hyperlink="hyperlink" 
w:followedHyperlink="followedHyperlink"/><w:shapeDefaults><o:shapedefaults v:ext="edit" spidmax="2050"/><o:shapelayout v:ext="edit"><o:idmap v:ext="edit" data="2"/></o:shapelayout></w:shapeDefaults><w:decimalSymbol w:val="."/><w:listSeparator w:val=","/><w14:docId w14:val="79E9351B"/><w15:chartTrackingRefBased/><w15:docId w15:val="{53785501-C96A-4E1A-AEE6-3D202E3FAC5F}"/></w:settings>
|
||||
2
mineru/model/docx/test2/word/styles.xml
Normal file
2
mineru/model/docx/test2/word/styles.xml
Normal file
File diff suppressed because one or more lines are too long
2
mineru/model/docx/test2/word/theme/theme1.xml
Normal file
2
mineru/model/docx/test2/word/theme/theme1.xml
Normal file
File diff suppressed because one or more lines are too long
2
mineru/model/docx/test2/word/webSettings.xml
Normal file
2
mineru/model/docx/test2/word/webSettings.xml
Normal file
@@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:webSettings xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16du="http://schemas.microsoft.com/office/word/2023/wordml/word16du" xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16sdtfl="http://schemas.microsoft.com/office/word/2024/wordml/sdtformatlock" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" mc:Ignorable="w14 w15 w16se w16cid w16 w16cex w16sdtdh w16sdtfl w16du"><w:optimizeForBrowser/><w:allowPNG/></w:webSettings>
|
||||
BIN
mineru/model/docx/textbox.docx
Normal file
BIN
mineru/model/docx/textbox.docx
Normal file
Binary file not shown.
1
mineru/model/docx/tools/__init__.py
Normal file
1
mineru/model/docx/tools/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Copyright (c) Opendatalab. All rights reserved.
|
||||
0
mineru/model/docx/tools/math/__init__.py
Normal file
0
mineru/model/docx/tools/math/__init__.py
Normal file
274
mineru/model/docx/tools/math/latex_dict.py
Normal file
274
mineru/model/docx/tools/math/latex_dict.py
Normal file
@@ -0,0 +1,274 @@
|
||||
"""
|
||||
Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
|
||||
On 23/01/2025
|
||||
"""
|
||||
|
||||
# Characters with special meaning in LaTeX that need a backslash escape.
CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")

# Empty string, used as the separator when joining LaTeX fragments.
BLANK = ""
# A single literal backslash (the LaTeX escape character).
BACKSLASH = "\\"
# LaTeX alignment-tab character.
ALN = "&"

# Accent / grouping characters mapped to ``str.format`` templates.
# Every value takes one positional argument (``{0}``): the LaTeX of the base
# expression.  Literal braces are doubled, so each template must contain
# exactly ``{{`` + ``{0}`` + ``}}`` -- an extra ``}`` makes ``format`` raise.
CHR = {
    # Unicode : Latex Math Symbols
    # Top accents
    "\u0300": "\\grave{{{0}}}",
    "\u0301": "\\acute{{{0}}}",
    "\u0302": "\\hat{{{0}}}",
    "\u0303": "\\tilde{{{0}}}",
    "\u0304": "\\bar{{{0}}}",
    "\u0305": "\\overbar{{{0}}}",
    "\u0306": "\\breve{{{0}}}",
    "\u0307": "\\dot{{{0}}}",
    "\u0308": "\\ddot{{{0}}}",
    "\u0309": "\\ovhook{{{0}}}",
    "\u030a": "\\ocirc{{{0}}}",
    "\u030c": "\\check{{{0}}}",
    "\u0310": "\\candra{{{0}}}",
    "\u0312": "\\oturnedcomma{{{0}}}",
    "\u0315": "\\ocommatopright{{{0}}}",
    "\u031a": "\\droang{{{0}}}",
    "\u0338": "\\not{{{0}}}",
    "\u20d0": "\\leftharpoonaccent{{{0}}}",
    "\u20d1": "\\rightharpoonaccent{{{0}}}",
    "\u20d2": "\\vertoverlay{{{0}}}",
    "\u20d6": "\\overleftarrow{{{0}}}",
    "\u20d7": "\\vec{{{0}}}",
    "\u20db": "\\dddot{{{0}}}",
    "\u20dc": "\\ddddot{{{0}}}",
    "\u20e1": "\\overleftrightarrow{{{0}}}",
    "\u20e7": "\\annuity{{{0}}}",
    "\u20e9": "\\widebridgeabove{{{0}}}",
    "\u20f0": "\\asteraccent{{{0}}}",
    # Bottom accents
    "\u0330": "\\wideutilde{{{0}}}",
    "\u0331": "\\underbar{{{0}}}",
    "\u20e8": "\\threeunderdot{{{0}}}",
    "\u20ec": "\\underrightharpoondown{{{0}}}",
    "\u20ed": "\\underleftharpoondown{{{0}}}",
    "\u20ee": "\\underleftarrow{{{0}}}",
    "\u20ef": "\\underrightarrow{{{0}}}",
    # Over | group
    "\u23b4": "\\overbracket{{{0}}}",
    "\u23dc": "\\overparen{{{0}}}",
    "\u23de": "\\overbrace{{{0}}}",
    # Under| group
    "\u23b5": "\\underbracket{{{0}}}",
    "\u23dd": "\\underparen{{{0}}}",
    "\u23df": "\\underbrace{{{0}}}",
}
|
||||
|
||||
# N-ary ("big") operator characters mapped to their LaTeX command.
CHR_BO = {
    # Big operators,
    "\u2140": "\\Bbbsum",
    "\u220f": "\\prod",
    "\u2210": "\\coprod",
    "\u2211": "\\sum",
    "\u222b": "\\int",
    "\u222c": "\\iint",
    "\u222d": "\\iiint",
    "\u222e": "\\oint",
    "\u222f": "\\oiint",
    "\u2230": "\\oiiint",
    "\u22c0": "\\bigwedge",
    "\u22c1": "\\bigvee",
    "\u22c2": "\\bigcap",
    "\u22c3": "\\bigcup",
    "\u2a00": "\\bigodot",
    "\u2a01": "\\bigoplus",
    "\u2a02": "\\bigotimes",
}

# Single-character text replacements: Unicode math symbol -> LaTeX.
# Command replacements carry a trailing space so that a following letter is
# not absorbed into the command name.
T = {
    # Greek letters (Mathematical Italic Small block, U+1D6FC..U+1D71B)
    "\U0001d6fc": "\\alpha ",
    "\U0001d6fd": "\\beta ",
    "\U0001d6fe": "\\gamma ",
    "\U0001d6ff": "\\delta ",
    "\U0001d700": "\\epsilon ",
    "\U0001d701": "\\zeta ",
    "\U0001d702": "\\eta ",
    "\U0001d703": "\\theta ",
    "\U0001d704": "\\iota ",
    "\U0001d705": "\\kappa ",
    "\U0001d706": "\\lambda ",
    "\U0001d707": "\\mu ",
    "\U0001d708": "\\nu ",
    "\U0001d709": "\\xi ",
    "\U0001d70a": "\\omicron ",
    "\U0001d70b": "\\pi ",
    "\U0001d70c": "\\rho ",
    "\U0001d70d": "\\varsigma ",
    "\U0001d70e": "\\sigma ",
    "\U0001d70f": "\\tau ",
    "\U0001d710": "\\upsilon ",
    "\U0001d711": "\\phi ",
    "\U0001d712": "\\chi ",
    "\U0001d713": "\\psi ",
    "\U0001d714": "\\omega ",
    "\U0001d715": "\\partial ",
    "\U0001d716": "\\varepsilon ",
    "\U0001d717": "\\vartheta ",
    "\U0001d718": "\\varkappa ",
    "\U0001d719": "\\varphi ",
    "\U0001d71a": "\\varrho ",
    "\U0001d71b": "\\varpi ",
    # Relation symbols
    "\u2190": "\\leftarrow ",
    "\u2191": "\\uparrow ",
    "\u2192": "\\rightarrow ",
    "\u2193": "\\downarrow ",
    "\u2194": "\\leftrightarrow ",
    "\u2195": "\\updownarrow ",
    "\u2196": "\\nwarrow ",
    "\u2197": "\\nearrow ",
    "\u2198": "\\searrow ",
    "\u2199": "\\swarrow ",
    "\u22ee": "\\vdots ",
    "\u22ef": "\\cdots ",
    "\u22f0": "\\adots ",
    "\u22f1": "\\ddots ",
    "\u2260": "\\ne ",
    "\u2264": "\\leq ",
    "\u2265": "\\geq ",
    "\u2266": "\\leqq ",
    "\u2267": "\\geqq ",
    "\u2268": "\\lneqq ",
    "\u2269": "\\gneqq ",
    "\u226a": "\\ll ",
    "\u226b": "\\gg ",
    "\u2208": "\\in ",
    "\u2209": "\\notin ",
    "\u220b": "\\ni ",
    "\u220c": "\\nni ",
    # Ordinary symbols
    "\u221e": "\\infty ",
    # Binary relations
    "\u00b1": "\\pm ",
    "\u2213": "\\mp ",
    # Italic, Latin, uppercase
    "\U0001d434": "A",
    "\U0001d435": "B",
    "\U0001d436": "C",
    "\U0001d437": "D",
    "\U0001d438": "E",
    "\U0001d439": "F",
    "\U0001d43a": "G",
    "\U0001d43b": "H",
    "\U0001d43c": "I",
    "\U0001d43d": "J",
    "\U0001d43e": "K",
    "\U0001d43f": "L",
    "\U0001d440": "M",
    "\U0001d441": "N",
    "\U0001d442": "O",
    "\U0001d443": "P",
    "\U0001d444": "Q",
    "\U0001d445": "R",
    "\U0001d446": "S",
    "\U0001d447": "T",
    "\U0001d448": "U",
    "\U0001d449": "V",
    "\U0001d44a": "W",
    "\U0001d44b": "X",
    "\U0001d44c": "Y",
    "\U0001d44d": "Z",
    # Italic, Latin, lowercase
    # (no entry for U+1D455: that code point is unassigned in Unicode,
    # italic "h" lives at U+210E instead)
    "\U0001d44e": "a",
    "\U0001d44f": "b",
    "\U0001d450": "c",
    "\U0001d451": "d",
    "\U0001d452": "e",
    "\U0001d453": "f",
    "\U0001d454": "g",
    "\U0001d456": "i",
    "\U0001d457": "j",
    "\U0001d458": "k",
    "\U0001d459": "l",
    "\U0001d45a": "m",
    "\U0001d45b": "n",
    "\U0001d45c": "o",
    "\U0001d45d": "p",
    "\U0001d45e": "q",
    "\U0001d45f": "r",
    "\U0001d460": "s",
    "\U0001d461": "t",
    "\U0001d462": "u",
    "\U0001d463": "v",
    "\U0001d464": "w",
    "\U0001d465": "x",
    "\U0001d466": "y",
    "\U0001d467": "z",
}
|
||||
|
||||
# Function-apply names mapped to templates; the ``{fe}`` marker (see
# FUNC_PLACE) is substituted with the function's argument by plain string
# replacement, not by ``str.format``.
FUNC = {
    "sin": r"\sin({fe})",
    "cos": r"\cos({fe})",
    "tan": r"\tan({fe})",
    "arcsin": r"\arcsin({fe})",
    "arccos": r"\arccos({fe})",
    "arctan": r"\arctan({fe})",
    "arccot": r"\arccot({fe})",
    "sinh": r"\sinh({fe})",
    "cosh": r"\cosh({fe})",
    "tanh": r"\tanh({fe})",
    "coth": r"\coth({fe})",
    "sec": r"\sec({fe})",
    "csc": r"\csc({fe})",
    "mod": r"\mod {fe}",
    "max": r"\max({fe})",
    "min": r"\min({fe})",
}

# Marker inside FUNC templates that stands for the function argument.
FUNC_PLACE = "{fe}"

# LaTeX line break: two literal backslashes.
BRK = r"\\"

# Fallback accent template (a hat) for when no accent character is given.
CHR_DEFAULT = {
    "ACC_VAL": r"\hat{{{0}}}",
}

# Bar position -> template taking the base expression as ``{0}``.
POS = {
    "top": r"\overline{{{0}}}",  # not sure
    "bot": r"\underline{{{0}}}",
}

# Fallback bar template for when no position is given.
POS_DEFAULT = {
    "BAR_VAL": r"\overline{{{0}}}",
}

# Subscript / superscript templates taking the script text as ``{0}``.
SUB = "_{{{0}}}"

SUP = "^{{{0}}}"

# Fraction type -> template with ``num`` and ``den`` keyword fields.
F = {
    "bar": r"\frac{{{num}}}{{{den}}}",
    "skw": "^{{{num}}}/_{{{den}}}",
    "noBar": r"\genfrac{{}}{{}}{{0pt}}{{}}{{{num}}}{{{den}}}",
    "lin": "{{{num}}}/{{{den}}}",
}
# Fallback fraction template for when no type is given.
F_DEFAULT = r"\frac{{{num}}}{{{den}}}"

# Delimiter template; ``left``/``right`` are the delimiter characters.
D = r"\left{left}{text}\right{right}"

# Default delimiters; "null" is the placeholder for an empty delimiter.
D_DEFAULT = {
    "left": "(",
    "right": ")",
    "null": ".",
}

# Radical with an explicit degree, and the plain square-root fallback.
RAD = r"\sqrt[{deg}]{{{text}}}"
RAD_DEFAULT = r"\sqrt{{{text}}}"
# Single centred-column array environment.
ARR = r"\begin{{array}}{{c}}{text}\end{{array}}"

# Base name of a lower-limit object -> template with a ``lim`` field.
LIM_FUNC = {
    "lim": r"\lim_{{{lim}}}",
    "max": r"\max_{{{lim}}}",
    "min": r"\min_{{{lim}}}",
}

# (text to find, replacement) pair applied to limit expressions.
LIM_TO = (r"\rightarrow", r"\to")

# Upper-limit template: ``lim`` is set over ``text``.
LIM_UPP = r"\overset{{{lim}}}{{{text}}}"

# Matrix environment template.
M = r"\begin{{matrix}}{text}\end{{matrix}}"
|
||||
455
mineru/model/docx/tools/math/omml.py
Normal file
455
mineru/model/docx/tools/math/omml.py
Normal file
@@ -0,0 +1,455 @@
|
||||
"""
|
||||
Office Math Markup Language (OMML)
|
||||
|
||||
Adapted from https://github.com/xiilei/dwml/blob/master/dwml/omml.py
|
||||
On 23/01/2025
|
||||
"""
|
||||
import lxml.etree as ET
|
||||
from loguru import logger
|
||||
from pylatexenc.latexencode import UnicodeToLatexEncoder
|
||||
|
||||
from .latex_dict import (
|
||||
ALN,
|
||||
ARR,
|
||||
BACKSLASH,
|
||||
BLANK,
|
||||
BRK,
|
||||
CHARS,
|
||||
CHR,
|
||||
CHR_BO,
|
||||
CHR_DEFAULT,
|
||||
D_DEFAULT,
|
||||
F_DEFAULT,
|
||||
FUNC,
|
||||
FUNC_PLACE,
|
||||
LIM_FUNC,
|
||||
LIM_TO,
|
||||
LIM_UPP,
|
||||
POS,
|
||||
POS_DEFAULT,
|
||||
RAD,
|
||||
RAD_DEFAULT,
|
||||
SUB,
|
||||
SUP,
|
||||
D,
|
||||
F,
|
||||
M,
|
||||
T,
|
||||
)
|
||||
|
||||
OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}"
|
||||
|
||||
|
||||
def load(stream):
    """Parse XML from *stream* and lazily yield one converter per ``oMath``.

    The document is parsed with ``ET.parse``; elements are located with
    ``findall`` on the namespace-qualified ``oMath`` tag (no ``.//``
    descendant search is requested).  Each match is wrapped in
    ``oMath2Latex``.  Being a generator, nothing is parsed until the first
    item is requested.
    """
    document = ET.parse(stream)
    math_tag = OMML_NS + "oMath"
    yield from (oMath2Latex(node) for node in document.findall(math_tag))
|
||||
|
||||
|
||||
def load_string(string):
    """Parse XML from *string* and lazily yield one converter per ``oMath``.

    Same as ``load`` but starts from in-memory markup via ``ET.fromstring``;
    ``findall`` is applied to the resulting root element with the
    namespace-qualified ``oMath`` tag.
    """
    root_element = ET.fromstring(string)
    math_tag = OMML_NS + "oMath"
    yield from (oMath2Latex(node) for node in root_element.findall(math_tag))
|
||||
|
||||
|
||||
def escape_latex(strs):
    """Return *strs* with LaTeX special characters backslash-escaped.

    Doubled backslashes are first collapsed to a single one.  Then every
    character listed in ``CHARS`` gets a backslash prefix, unless the
    character immediately before it is already a backslash.
    """
    collapsed = strs.replace(r"\\", "\\")
    pieces = []
    previous = None
    for char in collapsed:
        needs_escape = char in CHARS and previous != BACKSLASH
        pieces.append(BACKSLASH + char if needs_escape else char)
        previous = char
    return BLANK.join(pieces)
|
||||
|
||||
|
||||
def get_val(key, default=None, store=CHR):
    """Translate *key* through *store*, falling back to *default*.

    Returns *default* when *key* is ``None``.  Otherwise returns the entry
    for *key* in *store*, or *key* itself when it has no entry or when
    *store* is empty/falsy.
    """
    if key is None:
        return default
    if not store:
        return key
    return store.get(key, key)
|
||||
|
||||
|
||||
class Tag2Method:
    """Base class that dispatches OMML elements to handlers by tag name.

    Subclasses supply ``tag2meth``: a mapping from a tag name with the OMML
    namespace stripped to a plain function called as ``handler(self, elm)``.
    """

    def call_method(self, elm, stag=None):
        """Invoke the handler registered for *elm* and return its result.

        *stag* is the namespace-stripped tag; it is derived from
        ``elm.tag`` when omitted.  Returns ``None`` when the tag has no
        (truthy) handler.
        """
        registry = self.tag2meth
        if stag is None:
            stag = elm.tag.replace(OMML_NS, "")
        handler = registry.get(stag)
        return handler(self, elm) if handler else None

    def process_children_list(self, elm, include=None):
        """
        Process the children of *elm*; yield ``(stag, result, child)`` tuples.

        Children outside the OMML namespace are skipped, as are tags not in
        *include* when it is given.  A child whose handler returns ``None``
        is offered to ``process_unknow``; if that also returns ``None`` the
        child is dropped.
        """
        for child in list(elm):
            if OMML_NS not in child.tag:
                continue
            stag = child.tag.replace(OMML_NS, "")
            if include and (stag not in include):
                continue
            result = self.call_method(child, stag=stag)
            if result is None:
                result = self.process_unknow(child, stag)
            if result is not None:
                yield (stag, result, child)

    def process_children_dict(self, elm, include=None):
        """
        Process the children of *elm*; return a ``{stag: result}`` dict.

        When several children share a tag, the last one wins.
        """
        return {
            stag: result
            for stag, result, _child in self.process_children_list(elm, include)
        }

    def process_children(self, elm, include=None):
        """
        Process the children of *elm*; return the results joined as a string.

        Results that are ``Tag2Method`` instances are converted with ``str``.
        """
        fragments = []
        for _stag, result, _child in self.process_children_list(elm, include):
            if isinstance(result, Tag2Method):
                result = str(result)
            fragments.append(result)
        return BLANK.join(fragments)

    def process_unknow(self, elm, stag):
        """Fallback for tags without a handler; this base version ignores them."""
        return None
|
||||
|
||||
|
||||
class Pr(Tag2Method):
    """Common properties of an element (an OMML ``*Pr`` child).

    On construction the children of the given element are processed:
    ``chr``, ``pos``, ``begChr``, ``endChr`` and ``type`` children have their
    ``val`` attribute recorded, and a ``brk`` child records a line break.
    Recorded values are exposed as attributes (``pr.chr``, ``pr.pos``, ...);
    any attribute that was not recorded reads as ``None``.  ``text`` (also
    the ``str()`` of the object) holds the text produced by the children,
    which is the line break string for each ``brk`` child.
    """

    text = ""

    __val_tags = ("chr", "pos", "begChr", "endChr", "type")

    __innerdict = None  # can't use the __dict__

    def __init__(self, elm):
        self.__innerdict = {}
        self.text = self.process_children(elm)

    def __str__(self):
        return self.text

    def __unicode__(self):
        # Python 2 leftover; __str__ is already bound, so pass no argument.
        return self.__str__()

    def __getattr__(self, name):
        # Only reached for names not found normally: look them up among the
        # recorded property values and default to None instead of raising.
        return self.__innerdict.get(name, None)

    def do_brk(self, elm):
        """Record a line break and contribute it to ``text``."""
        self.__innerdict["brk"] = BRK
        return BRK

    def do_common(self, elm):
        """Record the ``val`` attribute of a known property tag.

        Returns ``None`` so the element contributes nothing to ``text``.
        """
        stag = elm.tag.replace(OMML_NS, "")
        if stag in self.__val_tags:
            t = elm.get(f"{OMML_NS}val")
            self.__innerdict[stag] = t
        return None

    tag2meth = {
        "brk": do_brk,
        "chr": do_common,
        "pos": do_common,
        "begChr": do_common,
        "endChr": do_common,
        "type": do_common,
    }
|
||||
|
||||
|
||||
class oMath2Latex(Tag2Method):
|
||||
"""
|
||||
Convert oMath element of omml to latex
|
||||
"""
|
||||
|
||||
_t_dict = T
|
||||
|
||||
__direct_tags = ("box", "sSub", "sSup", "sSubSup", "num", "den", "deg", "e")
|
||||
u = UnicodeToLatexEncoder(
|
||||
replacement_latex_protection="braces-all",
|
||||
unknown_char_policy="keep",
|
||||
unknown_char_warning=False,
|
||||
)
|
||||
|
||||
def __init__(self, element):
|
||||
self._latex = self.process_children(element)
|
||||
|
||||
def __str__(self):
|
||||
return self.latex.replace(" ", " ")
|
||||
|
||||
def __unicode__(self):
|
||||
return self.__str__(self)
|
||||
|
||||
def process_unknow(self, elm, stag):
|
||||
if stag in self.__direct_tags:
|
||||
return self.process_children(elm)
|
||||
elif stag[-2:] == "Pr":
|
||||
return Pr(elm)
|
||||
else:
|
||||
return None
|
||||
|
||||
@property
|
||||
def latex(self):
|
||||
return self._latex
|
||||
|
||||
def do_acc(self, elm):
|
||||
"""
|
||||
the accent function
|
||||
"""
|
||||
c_dict = self.process_children_dict(elm)
|
||||
latex_s = get_val(
|
||||
c_dict["accPr"].chr, default=CHR_DEFAULT.get("ACC_VAL"), store=CHR
|
||||
)
|
||||
return latex_s.format(c_dict["e"])
|
||||
|
||||
def do_bar(self, elm):
|
||||
"""
|
||||
the bar function
|
||||
"""
|
||||
c_dict = self.process_children_dict(elm)
|
||||
pr = c_dict["barPr"]
|
||||
latex_s = get_val(pr.pos, default=POS_DEFAULT.get("BAR_VAL"), store=POS)
|
||||
return pr.text + latex_s.format(c_dict["e"])
|
||||
|
||||
def do_d(self, elm):
|
||||
"""
|
||||
the delimiter object
|
||||
"""
|
||||
c_dict = self.process_children_dict(elm)
|
||||
pr = c_dict["dPr"]
|
||||
null = D_DEFAULT.get("null")
|
||||
|
||||
s_val = get_val(pr.begChr, default=D_DEFAULT.get("left"), store=T)
|
||||
e_val = get_val(pr.endChr, default=D_DEFAULT.get("right"), store=T)
|
||||
delim = pr.text + D.format(
|
||||
left=null if not s_val else escape_latex(s_val),
|
||||
text=c_dict["e"],
|
||||
right=null if not e_val else escape_latex(e_val),
|
||||
)
|
||||
return delim
|
||||
|
||||
def do_spre(self, elm):
|
||||
"""
|
||||
the Pre-Sub-Superscript object -- Not support yet
|
||||
"""
|
||||
|
||||
def do_sub(self, elm):
|
||||
text = self.process_children(elm)
|
||||
return SUB.format(text)
|
||||
|
||||
def do_sup(self, elm):
|
||||
text = self.process_children(elm)
|
||||
return SUP.format(text)
|
||||
|
||||
def do_f(self, elm):
|
||||
"""
|
||||
the fraction object
|
||||
"""
|
||||
c_dict = self.process_children_dict(elm)
|
||||
pr = c_dict.get("fPr")
|
||||
if pr is None:
|
||||
# Handle missing fPr element gracefully
|
||||
logger.debug("Missing fPr element in fraction, using default formatting")
|
||||
latex_s = F_DEFAULT
|
||||
return latex_s.format(
|
||||
num=c_dict.get("num"),
|
||||
den=c_dict.get("den"),
|
||||
)
|
||||
latex_s = get_val(pr.type, default=F_DEFAULT, store=F)
|
||||
return pr.text + latex_s.format(num=c_dict.get("num"), den=c_dict.get("den"))
|
||||
|
||||
def do_func(self, elm):
|
||||
"""
|
||||
the Function-Apply object (Examples:sin cos)
|
||||
"""
|
||||
c_dict = self.process_children_dict(elm)
|
||||
func_name = c_dict.get("fName")
|
||||
return func_name.replace(FUNC_PLACE, c_dict.get("e"))
|
||||
|
||||
def do_fname(self, elm):
|
||||
"""
|
||||
the func name
|
||||
"""
|
||||
latex_chars = []
|
||||
for stag, t, e in self.process_children_list(elm):
|
||||
if stag == "r":
|
||||
if FUNC.get(t):
|
||||
latex_chars.append(FUNC[t])
|
||||
else:
|
||||
logger.warning("Function not supported, will default to text: %s", t)
|
||||
if isinstance(t, str):
|
||||
latex_chars.append(t)
|
||||
elif isinstance(t, str):
|
||||
latex_chars.append(t)
|
||||
t = BLANK.join(latex_chars)
|
||||
return t if FUNC_PLACE in t else t + FUNC_PLACE # do_func will replace this
|
||||
|
||||
def do_groupchr(self, elm):
|
||||
"""
|
||||
the Group-Character object
|
||||
"""
|
||||
c_dict = self.process_children_dict(elm)
|
||||
pr = c_dict["groupChrPr"]
|
||||
latex_s = get_val(pr.chr)
|
||||
return pr.text + latex_s.format(c_dict["e"])
|
||||
|
||||
def do_rad(self, elm):
|
||||
"""
|
||||
the radical object
|
||||
"""
|
||||
c_dict = self.process_children_dict(elm)
|
||||
text = c_dict.get("e")
|
||||
deg_text = c_dict.get("deg")
|
||||
if deg_text:
|
||||
return RAD.format(deg=deg_text, text=text)
|
||||
else:
|
||||
return RAD_DEFAULT.format(text=text)
|
||||
|
||||
def do_eqarr(self, elm):
|
||||
"""
|
||||
the Array object
|
||||
"""
|
||||
return ARR.format(
|
||||
text=BRK.join(
|
||||
[t for stag, t, e in self.process_children_list(elm, include=("e",))]
|
||||
)
|
||||
)
|
||||
|
||||
def do_limlow(self, elm):
|
||||
"""
|
||||
the Lower-Limit object
|
||||
"""
|
||||
t_dict = self.process_children_dict(elm, include=("e", "lim"))
|
||||
latex_s = LIM_FUNC.get(t_dict["e"])
|
||||
if not latex_s:
|
||||
raise RuntimeError("Not support lim {}".format(t_dict["e"]))
|
||||
else:
|
||||
return latex_s.format(lim=t_dict.get("lim"))
|
||||
|
||||
def do_limupp(self, elm):
|
||||
"""
|
||||
the Upper-Limit object
|
||||
"""
|
||||
t_dict = self.process_children_dict(elm, include=("e", "lim"))
|
||||
return LIM_UPP.format(lim=t_dict.get("lim"), text=t_dict.get("e"))
|
||||
|
||||
def do_lim(self, elm):
|
||||
"""
|
||||
the lower limit of the limLow object and the upper limit of the limUpp function
|
||||
"""
|
||||
return self.process_children(elm).replace(LIM_TO[0], LIM_TO[1])
|
||||
|
||||
def do_m(self, elm):
|
||||
"""
|
||||
the Matrix object
|
||||
"""
|
||||
rows = []
|
||||
for stag, t, e in self.process_children_list(elm):
|
||||
if stag == "mPr":
|
||||
pass
|
||||
elif stag == "mr":
|
||||
rows.append(t)
|
||||
return M.format(text=BRK.join(rows))
|
||||
|
||||
def do_mr(self, elm):
|
||||
"""
|
||||
a single row of the matrix m
|
||||
"""
|
||||
return ALN.join(
|
||||
[t for stag, t, e in self.process_children_list(elm, include=("e",))]
|
||||
)
|
||||
|
||||
def do_nary(self, elm):
|
||||
"""
|
||||
the n-ary object
|
||||
"""
|
||||
res = []
|
||||
bo = ""
|
||||
for stag, t, e in self.process_children_list(elm):
|
||||
if stag == "naryPr":
|
||||
# if <m:naryPr> contains no <m:chr>, the n-ary represents an integral
|
||||
bo = get_val(t.chr, default="\\int", store=CHR_BO)
|
||||
else:
|
||||
res.append(t)
|
||||
return bo + BLANK.join(res)
|
||||
|
||||
def process_unicode(self, s):
|
||||
# s = s if isinstance(s,unicode) else unicode(s,'utf-8')
|
||||
# print(s, self._t_dict.get(s, s), unicode_to_latex(s))
|
||||
# _str.append( self._t_dict.get(s, s) )
|
||||
|
||||
out_latex_str = self.u.unicode_to_latex(s)
|
||||
|
||||
if (
|
||||
s.startswith("{") is False
|
||||
and out_latex_str.startswith("{")
|
||||
and s.endswith("}") is False
|
||||
and out_latex_str.endswith("}")
|
||||
):
|
||||
out_latex_str = f" {out_latex_str[1:-1]} "
|
||||
|
||||
if "ensuremath" in out_latex_str:
|
||||
out_latex_str = out_latex_str.replace("\\ensuremath{", " ")
|
||||
out_latex_str = out_latex_str.replace("}", " ")
|
||||
|
||||
if out_latex_str.strip().startswith("\\text"):
|
||||
out_latex_str = f" \\text{{{out_latex_str}}} "
|
||||
|
||||
return out_latex_str
|
||||
|
||||
def do_r(self, elm):
    """
    Get text from 'r' element,And try convert them to latex symbols
    @todo text style support , (sty)
    @todo \\text (latex pure text support)

    The text of the ``t`` child is converted character by character with
    ``process_unicode``; the joined result is passed through
    ``escape_latex``. Escaped braces are un-escaped again when the source
    text itself contained no brace of that kind, because in that case the
    brace was produced by the conversion rather than by the document.
    """
    raw_text = elm.findtext(f"./{OMML_NS}t")
    source_chars = list(raw_text) if raw_text else []
    converted_chars = [self.process_unicode(ch) for ch in source_chars]

    proc_str = escape_latex(BLANK.join(converted_chars))
    base_proc_str = BLANK.join(source_chars)

    # replace() is a no-op when the escaped brace is absent
    if "{" not in base_proc_str:
        proc_str = proc_str.replace("\\{", "{")
    if "}" not in base_proc_str:
        proc_str = proc_str.replace("\\}", "}")

    return proc_str
|
||||
|
||||
# Dispatch table: OMML tag name -> handler that renders that element.
tag2meth = dict(
    acc=do_acc,
    r=do_r,
    bar=do_bar,
    sub=do_sub,
    sup=do_sup,
    f=do_f,
    func=do_func,
    fName=do_fname,
    groupChr=do_groupchr,
    d=do_d,
    rad=do_rad,
    eqArr=do_eqarr,
    limLow=do_limlow,
    limUpp=do_limupp,
    lim=do_lim,
    m=do_m,
    mr=do_mr,
    nary=do_nary,
)
|
||||
55
mineru/model/docx/tools/office_xml.py
Normal file
55
mineru/model/docx/tools/office_xml.py
Normal file
@@ -0,0 +1,55 @@
|
||||
import xml.dom.minidom
|
||||
|
||||
from mammoth.docx.xmlparser import XmlText, XmlElement
|
||||
from mammoth.docx.office_xml import _collapse_alternate_content, _namespaces
|
||||
|
||||
|
||||
def parse_xml_str(xml_str, namespace_mapping=None):
    """Parse an XML string into a tree of ``XmlElement`` / ``XmlText`` nodes.

    Args:
        xml_str: XML document as a string (or bytes) accepted by
            ``xml.dom.minidom.parseString``.
        namespace_mapping: Optional iterable of ``(prefix, uri)`` pairs.
            Names in a mapped namespace are rendered as ``prefix:local``;
            names in an unmapped namespace as ``{uri}local``.

    Returns:
        The converted document element.
    """
    xmlns_uri = "http://www.w3.org/2000/xmlns/"
    if namespace_mapping is None:
        namespace_prefixes = {}
    else:
        namespace_prefixes = {uri: prefix for prefix, uri in namespace_mapping}

    def convert_name(node):
        uri = node.namespaceURI
        if uri is None:
            return node.localName
        prefix = namespace_prefixes.get(uri)
        if prefix is None:
            return f"{{{uri}}}{node.localName}"
        return f"{prefix}:{node.localName}"

    def convert_element(element):
        name = convert_name(element)
        # Namespace declarations (xmlns:*) are not real attributes; drop them.
        attributes = {
            convert_name(attribute): attribute.value
            for attribute in element.attributes.values()
            if attribute.namespaceURI != xmlns_uri
        }
        children = [
            child
            for child in map(convert_node, element.childNodes)
            if child is not None
        ]
        return XmlElement(name, attributes, children)

    def convert_node(node):
        # Only element and text nodes are kept; comments etc. become None.
        if node.nodeType == xml.dom.Node.ELEMENT_NODE:
            return convert_element(node)
        if node.nodeType == xml.dom.Node.TEXT_NODE:
            return XmlText(node.nodeValue)
        return None

    document = xml.dom.minidom.parseString(xml_str)
    return convert_node(document.documentElement)
|
||||
|
||||
|
||||
def read_str(xml_str):
    """Parse ``xml_str`` with the Office namespace prefixes and return the
    first node left after collapsing alternate-content elements."""
    parsed = parse_xml_str(xml_str, _namespaces)
    collapsed = _collapse_alternate_content(parsed)
    return collapsed[0]
|
||||
0
mineru/model/pptx/__init__.py
Normal file
0
mineru/model/pptx/__init__.py
Normal file
18
mineru/model/pptx/main.py
Normal file
18
mineru/model/pptx/main.py
Normal file
@@ -0,0 +1,18 @@
|
||||
from typing import BinaryIO
|
||||
|
||||
from mineru.model.pptx.pptx_converter import PptxConverter
|
||||
|
||||
|
||||
def convert_path(file_path: str):
    """Open the PPTX file at ``file_path`` in binary mode and return the
    result of ``convert_binary`` for it."""
    with open(file_path, "rb") as stream:
        pages = convert_binary(stream)
    return pages
|
||||
|
||||
|
||||
def convert_binary(file_binary: BinaryIO):
    """Run a fresh ``PptxConverter`` over the binary stream and return the
    pages it collected."""
    pptx_converter = PptxConverter()
    pptx_converter.convert(file_binary)
    return pptx_converter.pages
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: convert the sample deck and dump the result.
    sample_pages = convert_path("powerpoint_sample.pptx")
    print(sample_pages)
|
||||
BIN
mineru/model/pptx/powerpoint_sample.pptx
Normal file
BIN
mineru/model/pptx/powerpoint_sample.pptx
Normal file
Binary file not shown.
610
mineru/model/pptx/pptx_converter.py
Normal file
610
mineru/model/pptx/pptx_converter.py
Normal file
@@ -0,0 +1,610 @@
|
||||
from io import BytesIO
|
||||
from typing import Final, BinaryIO, Optional
|
||||
|
||||
from lxml import etree
|
||||
from pptx import Presentation, presentation
|
||||
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
|
||||
from pptx.oxml.text import CT_TextLineBreak
|
||||
from loguru import logger
|
||||
from PIL import Image, UnidentifiedImageError, WmfImagePlugin
|
||||
|
||||
from mineru.utils.enum_class import BlockType
|
||||
from mineru.utils.pdf_reader import image_to_b64str
|
||||
|
||||
|
||||
class PptxConverter:
|
||||
|
||||
def __init__(self):
|
||||
self.namespaces = {
|
||||
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
|
||||
"c": "http://schemas.openxmlformats.org/drawingml/2006/chart",
|
||||
"p": "http://schemas.openxmlformats.org/presentationml/2006/main",
|
||||
}
|
||||
self.file_stream = None
|
||||
self.pptx_obj = None
|
||||
self.pages = []
|
||||
self.cur_page = []
|
||||
self.list_block_stack: list = [] # 列表块堆栈
|
||||
|
||||
def convert(self, file_stream: BinaryIO):
    """Parse ``file_stream`` as a PPTX presentation and fill ``self.pages``.

    Args:
        file_stream: Binary stream containing the .pptx file.
    """
    self.file_stream = file_stream
    self.pptx_obj = Presentation(self.file_stream)

    # The first slide writes into the page registered here; _walk_linear
    # registers a fresh page after every slide.
    self.pages.append(self.cur_page)
    if self.pptx_obj:
        self._walk_linear(self.pptx_obj)

    # Drop the trailing page if nothing was written to it.
    if not self.pages[-1]:
        self.pages.pop()
|
||||
|
||||
def _walk_linear(self, pptx_obj: presentation.Presentation):
    """Visit every shape of every slide in order and emit blocks for it.

    After each slide a new empty page is started and registered in
    ``self.pages``.

    Args:
        pptx_obj: The parsed presentation.
    """

    def visit(shape):
        # Group shapes: handle the members first, then fall through so the
        # group shape itself is examined like any other shape.
        if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
            for member in shape.shapes:
                visit(member)

        # Tables
        if shape.has_table:
            self._handle_tables(shape)

        # Pictures
        if shape.shape_type == MSO_SHAPE_TYPE.PICTURE and hasattr(shape, "image"):
            self._handle_pictures(shape)

        # Shapes without any text need no further processing.
        if not hasattr(shape, "text"):
            return
        text = shape.text
        if text is None or len(text.strip()) == 0:
            return
        if not shape.has_text_frame:
            logger.warning("Warning: shape has text but not text_frame")
            return

        # Remaining text elements, including bulleted / numbered lists.
        self._handle_text_elements(shape)

    for slide in pptx_obj.slides:
        for shape in slide.shapes:
            visit(shape)

        self.cur_page = []
        self.pages.append(self.cur_page)
|
||||
|
||||
def _handle_tables(self, shape):
    """Convert a PowerPoint table to HTML and append it to the current page.

    The table is rendered as a ``<table>`` whose first row uses ``<th>``
    cells and whose remaining rows use ``<td>`` cells. Merged cells are
    emitted once, at their origin, with ``rowspan`` / ``colspan``
    attributes; the positions they cover are skipped.

    Args:
        shape: Shape object that contains the table.

    Returns:
        None. The result is appended to ``self.cur_page`` as a
        ``BlockType.TABLE`` block; nothing is appended when the shape has
        no table.
    """
    # Function-scope import so this block does not depend on the module
    # header being edited.
    from html import escape

    if not shape.has_table:
        return None

    table = shape.table
    table_xml = shape._element

    html_parts = ['<table border="1">']

    # (row, col) positions covered by a merged cell that starts elsewhere.
    occupied_cells = set()

    for row_idx, row in enumerate(table.rows):
        html_parts.append(" <tr>")

        for col_idx, cell in enumerate(row.cells):
            # Skip positions swallowed by an earlier merged cell.
            if (row_idx, col_idx) in occupied_cells:
                continue

            # The span information lives on the cell's XML element.
            matches = table_xml.xpath(
                f".//a:tbl/a:tr[{row_idx + 1}]/a:tc[{col_idx + 1}]"
            )
            if not matches:
                continue
            cell_xml = matches[0]

            # Missing or empty attributes mean "no span".
            row_span = int(cell_xml.get("rowSpan") or 1)
            col_span = int(cell_xml.get("gridSpan") or 1)

            # Mark every position this cell covers, except its own origin.
            for r in range(row_idx, row_idx + row_span):
                for c in range(col_idx, col_idx + col_span):
                    if (r, c) != (row_idx, col_idx):
                        occupied_cells.add((r, c))

            # Header cells for the first row, data cells otherwise.
            tag = "th" if row_idx == 0 else "td"

            attrs = []
            if row_span > 1:
                attrs.append(f'rowspan="{row_span}"')
            if col_span > 1:
                attrs.append(f'colspan="{col_span}"')
            attr_str = " " + " ".join(attrs) if attrs else ""

            cell_text = cell.text.strip() if cell.text else ""
            # Escape &, < and > so cell text cannot inject markup. Quotes
            # are left alone because the text is element content, not an
            # attribute value.
            cell_text = escape(cell_text, quote=False)

            html_parts.append(f" <{tag}{attr_str}>{cell_text}</{tag}>")

        html_parts.append(" </tr>")

    html_parts.append("</table>")

    self.cur_page.append(
        {
            "type": BlockType.TABLE,
            "content": "\n".join(html_parts),
        }
    )

    return None
|
||||
|
||||
def _handle_pictures(self, shape):
    """Encode the picture of ``shape`` as base64 and append an image block.

    WMF images, which Pillow only loads as stubs, are replaced by a light
    grey placeholder of the same size. All other images are converted to
    RGB if needed. Images Pillow cannot load are logged and skipped.

    Args:
        shape: Picture shape exposing an ``image`` attribute.
    """
    try:
        picture = shape.image
        raw_bytes = picture.blob
        _dpi_x, _ = picture.dpi
        loaded = Image.open(BytesIO(raw_bytes))

        if isinstance(loaded, WmfImagePlugin.WmfStubImageFile):
            logger.warning(f"Skipping WMF image, size: {loaded.size}")
            encodable = Image.new("RGB", loaded.size, (240, 240, 240))
        elif loaded.mode != "RGB":
            encodable = loaded.convert("RGB")
        else:
            encodable = loaded

        self.cur_page.append(
            {
                "type": BlockType.IMAGE,
                "content": image_to_b64str(encodable),
            }
        )
    except (UnidentifiedImageError, OSError) as e:
        logger.warning(f"Warning: image cannot be loaded by Pillow: {e}")
    return
|
||||
|
||||
def _handle_text_elements(self, shape):
    """Emit text, title and list blocks for the paragraphs of ``shape``.

    Consecutive list paragraphs are collected into one ``BlockType.LIST``
    block whose ``attribute`` ("ordered" / "unordered") is decided by the
    first item. Numbered items get a running ``"N. "`` prefix. Any other
    paragraph closes the open list and is emitted as a title block (for
    title / centre-title / subtitle placeholders) or a plain text block.

    Args:
        shape: Shape with a text frame.
    """
    list_open = False
    next_number = 0

    for paragraph in shape.text_frame.paragraphs:
        is_a_list, bullet_type = self._is_list_item(paragraph)

        # Accumulate the run texts; line breaks become single spaces.
        p_text = "".join(
            " " if isinstance(child, CT_TextLineBreak) else child.text
            for child in paragraph._element.content_children
        )

        if is_a_list:
            enumerated = bullet_type == "Numbered"

            if not list_open:
                list_block = {
                    "type": BlockType.LIST,
                    "attribute": "ordered" if enumerated else "unordered",
                    "list_items": [],
                }
                self.cur_page.append(list_block)
                self.list_block_stack.append(list_block)
                list_open = True
                next_number = 0

            enum_marker = ""
            if enumerated:
                next_number += 1
                enum_marker = str(next_number) + ". "

            self.list_block_stack[-1]["list_items"].append(
                {
                    "type": BlockType.TEXT,
                    "content": enum_marker + p_text,
                }
            )
            continue

        # The paragraph is not a list item: close any open list first.
        if list_open:
            list_open = False
            next_number = 0
            self.list_block_stack.pop()

        # Pick the label from the placeholder type (title vs. plain text).
        label = BlockType.TEXT
        if shape.is_placeholder:
            if shape.placeholder_format.type in (
                PP_PLACEHOLDER.CENTER_TITLE,
                PP_PLACEHOLDER.TITLE,
                PP_PLACEHOLDER.SUBTITLE,
            ):
                label = BlockType.TITLE

        self.cur_page.append(
            {
                "type": label,
                "content": p_text,
            }
        )

    # A shape that ends on a list item would otherwise leave its list block
    # on the stack forever; keep push and pop balanced.
    if list_open:
        self.list_block_stack.pop()
    return
|
||||
|
||||
def _is_list_item(self, paragraph) -> tuple[bool, str]:
|
||||
"""
|
||||
判断段落是否应被视为列表项。
|
||||
该方法首先尝试通过拥有该段落的形状来解析列表样式信息。
|
||||
如果无法做到,则回退到基于段落属性和级别的更简单检查。
|
||||
Args:
|
||||
paragraph: 需要检查的'python-pptx'段落对象。
|
||||
|
||||
Returns:
|
||||
返回一个2元组(`is_list`, `bullet_type`),其中:
|
||||
`is_list` - 若段落被视为列表项,为True,否则为False;
|
||||
`bullet_type` - 为以下之一:'Bullet'(项目符号)、'Numbered'(编号)或'None',
|
||||
描述列表标记类型。
|
||||
"""
|
||||
p = paragraph._element
|
||||
|
||||
# 尝试从段落获取形状(包含该段落的对象),如果可能的话
|
||||
shape = None
|
||||
try:
|
||||
# 这个路径适用于python-pptx段落对象
|
||||
# 首先获取文本框架(段落的父对象)
|
||||
text_frame = paragraph._parent
|
||||
# 然后获取形状(文本框架的父对象)
|
||||
shape = text_frame._parent
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
if shape is not None:
|
||||
marker_info = self._get_effective_list_marker(shape, paragraph)
|
||||
|
||||
# 检查这是否肯定是一个列表项
|
||||
if marker_info["is_list"] is True or marker_info["kind"] in (
|
||||
"buChar",
|
||||
"buAutoNum",
|
||||
"buBlip",
|
||||
):
|
||||
if marker_info["kind"] == "buChar":
|
||||
return (True, "Bullet")
|
||||
elif marker_info["kind"] == "buAutoNum":
|
||||
return (True, "Numbered")
|
||||
else:
|
||||
return (True, "None")
|
||||
|
||||
# 检查这是否肯定不是列表项
|
||||
if marker_info["is_list"] is False:
|
||||
return (False, "None")
|
||||
|
||||
# 回退到段落级别检查(缩进级别大于0时视为列表项)
|
||||
if paragraph.level > 0:
|
||||
return (True, "None")
|
||||
|
||||
return (False, "None")
|
||||
|
||||
# 如果无法获取形状,使用更简单的检查方式
|
||||
if p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]}) is not None:
|
||||
return (True, "Bullet")
|
||||
elif (
|
||||
p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]}) is not None
|
||||
):
|
||||
return (True, "Numbered")
|
||||
elif paragraph.level > 0:
|
||||
# 很可能是子列表项(缩进表示嵌套)
|
||||
return (True, "None")
|
||||
else:
|
||||
return (False, "None")
|
||||
|
||||
def _get_effective_list_marker(self, shape, paragraph) -> dict:
|
||||
"""
|
||||
返回描述段落的有效列表标记的字典。
|
||||
列表标记信息可以来自多个来源:直接段落属性、形状级别的列表样式、
|
||||
布局占位符或主幻灯片文本样式。此辅助方法解析所有这些层,并返回
|
||||
有效标记的统一视图。
|
||||
|
||||
Args:
|
||||
shape: 包含段落的形状对象。
|
||||
paragraph: 需要检查的'python-pptx'段落对象。
|
||||
|
||||
Returns:
|
||||
返回列表标记信息的字典,其中:
|
||||
`is_list` - True/False/None,表示这是否是列表项;
|
||||
`kind` - 为以下之一:`buChar`、`buAutoNum`、`buBlip`、`buNone`或None,描述标记类型;
|
||||
`detail` - 项目符号字符或编号类型字符串,或如果不适用则为None;
|
||||
`level` - 段落级别,范围在(0, 8)内。
|
||||
"""
|
||||
p = paragraph._element
|
||||
lvl = self._get_paragraph_level(p)
|
||||
|
||||
# 1) 直接段落属性
|
||||
pPr = p.find("a:pPr", namespaces=self.namespaces)
|
||||
is_list, kind, detail = self._parse_bullet_from_paragraph_properties(pPr)
|
||||
if is_list is not None:
|
||||
return {
|
||||
"is_list": is_list,
|
||||
"kind": kind,
|
||||
"detail": detail,
|
||||
"level": lvl,
|
||||
}
|
||||
|
||||
# 2) 形状级别的列表样式(txBody/a:lstStyle)
|
||||
txBody = shape._element.find(".//p:txBody", namespaces=self.namespaces)
|
||||
is_list, kind, detail = self._parse_bullet_from_text_body_list_style(
|
||||
txBody, lvl
|
||||
)
|
||||
if is_list is not None:
|
||||
return {
|
||||
"is_list": is_list,
|
||||
"kind": kind,
|
||||
"detail": detail,
|
||||
"level": lvl,
|
||||
}
|
||||
|
||||
# 3) 布局占位符列表样式(如果这是一个占位符)
|
||||
layout_result = None
|
||||
if shape.is_placeholder:
|
||||
idx = shape.placeholder_format.idx
|
||||
layout = shape.part.slide.slide_layout
|
||||
layout_ph = None
|
||||
try:
|
||||
layout_ph = layout.placeholders.get(idx)
|
||||
except Exception:
|
||||
layout_ph = None
|
||||
|
||||
if layout_ph is not None:
|
||||
layout_tx = layout_ph._element.find(
|
||||
".//p:txBody", namespaces=self.namespaces
|
||||
)
|
||||
is_list, kind, detail = self._parse_bullet_from_text_body_list_style(
|
||||
layout_tx, lvl
|
||||
)
|
||||
|
||||
# 仅在is_list明确为True/False时使用布局结果
|
||||
if is_list is not None:
|
||||
layout_result = {
|
||||
"is_list": is_list,
|
||||
"kind": kind,
|
||||
"detail": detail,
|
||||
"level": lvl,
|
||||
}
|
||||
|
||||
# 4) 解析主文本样式
|
||||
ph_type = shape.placeholder_format.type
|
||||
master = shape.part.slide.slide_layout.slide_master
|
||||
is_list, kind, detail = self._parse_bullet_from_master_text_styles(
|
||||
master, ph_type, lvl
|
||||
)
|
||||
|
||||
# 检查主样式是否有标记信息
|
||||
if kind in ("buChar", "buAutoNum", "buBlip"):
|
||||
return {
|
||||
"is_list": True,
|
||||
"kind": kind,
|
||||
"detail": detail,
|
||||
"level": lvl,
|
||||
}
|
||||
elif is_list is not None:
|
||||
return {
|
||||
"is_list": is_list,
|
||||
"kind": kind,
|
||||
"detail": detail,
|
||||
"level": lvl,
|
||||
}
|
||||
|
||||
# If layout has explicit is_list value but master didn't override it, use layout
|
||||
# 如果布局有显式的is_list值但主样式没有覆盖它,则使用布局结果
|
||||
if layout_result is not None:
|
||||
return layout_result
|
||||
|
||||
return {
|
||||
"is_list": None,
|
||||
"kind": None,
|
||||
"detail": None,
|
||||
"level": lvl,
|
||||
}
|
||||
|
||||
def _get_paragraph_level(self, paragraph) -> int:
|
||||
"""
|
||||
返回段落XML元素的缩进级别。
|
||||
段落可以有不同的缩进级别(0-8)。级别存储在段落属性XML元素的'lvl'属性中。
|
||||
|
||||
Args:
|
||||
paragraph: 需要提取级别的段落XML元素。
|
||||
|
||||
Returns:
|
||||
返回范围在(0, 8)内的段落级别。当找不到'a:pPr'元素、没有'lvl'属性
|
||||
或'lvl'属性值无效时,返回0。
|
||||
"""
|
||||
pPr = paragraph.find("a:pPr", namespaces=self.namespaces)
|
||||
if pPr is not None and "lvl" in pPr.attrib:
|
||||
try:
|
||||
return int(pPr.get("lvl"))
|
||||
except ValueError:
|
||||
pass
|
||||
return 0
|
||||
|
||||
def _parse_bullet_from_paragraph_properties(
|
||||
self, pPr
|
||||
) -> tuple[Optional[bool], Optional[str], Optional[str]]:
|
||||
"""
|
||||
从段落属性节点解析项目符号或编号信息。
|
||||
检查'a:pPr'或'a:lvlXpPr'元素,并提取关于项目符号字符、自动编号、
|
||||
图片项目符号或显式'buNone'标记的信息。
|
||||
|
||||
Args:
|
||||
pPr: 段落属性XML元素('a:pPr'或'a:lvlXpPr')。
|
||||
|
||||
Returns:
|
||||
返回一个3元组(`is_list`, `kind`, `detail`),其中:
|
||||
`is_list` - 为True/False/None,表示这是否是列表项;
|
||||
`kind` - 为以下之一:`buChar`(项目符号字符)、`buAutoNum`(自动编号)、
|
||||
`buBlip`(图片项目符号)、`buNone`(无标记)或None,描述标记类型;
|
||||
`detail` - 项目符号字符、编号类型字符串,或如果不适用则为None。
|
||||
"""
|
||||
if pPr is None:
|
||||
return (None, None, None)
|
||||
|
||||
# 显式指定无项目符号
|
||||
if pPr.find("a:buNone", namespaces=self.namespaces) is not None:
|
||||
return (False, "buNone", None)
|
||||
|
||||
# 项目符号字符
|
||||
buChar = pPr.find("a:buChar", namespaces=self.namespaces)
|
||||
if buChar is not None:
|
||||
return (True, "buChar", buChar.get("char"))
|
||||
|
||||
# 自动编号
|
||||
buAuto = pPr.find("a:buAutoNum", namespaces=self.namespaces)
|
||||
if buAuto is not None:
|
||||
return (True, "buAutoNum", buAuto.get("type"))
|
||||
|
||||
# 图片项目符号
|
||||
buBlip = pPr.find("a:buBlip", namespaces=self.namespaces)
|
||||
if buBlip is not None:
|
||||
return (True, "buBlip", "image")
|
||||
|
||||
return (None, None, None)
|
||||
|
||||
def _parse_bullet_from_text_body_list_style(
|
||||
self, txBody, lvl: int
|
||||
) -> tuple[Optional[bool], Optional[str], Optional[str]]:
|
||||
"""
|
||||
从文本体的列表样式中解析项目符号或编号信息。
|
||||
在'txBody'下搜索'a:lstStyle/a:lvl{lvl+1}pPr',并使用级别特定的段落属性
|
||||
推断项目符号或编号信息。
|
||||
|
||||
Args:
|
||||
txBody: 文本体XML元素'p:txBody'。
|
||||
lvl: 段落级别,范围在(0, 8)内。
|
||||
Returns:
|
||||
返回一个3元组(`is_list`, `kind`, `detail`),其中:
|
||||
`is_list` - 为True/False/None,表示这是否是列表项;
|
||||
`kind` - 为以下之一:`buChar`、`buAutoNum`、`buBlip`、`buNone`或None;
|
||||
`detail` - 项目符号字符、编号类型字符串,或如果不适用则为None。
|
||||
"""
|
||||
if txBody is None:
|
||||
return (None, None, None)
|
||||
lstStyle = txBody.find("a:lstStyle", namespaces=self.namespaces)
|
||||
lvl_pPr = self._find_level_properties_in_list_style(lstStyle, lvl)
|
||||
is_list, kind, detail = self._parse_bullet_from_paragraph_properties(lvl_pPr)
|
||||
return (is_list, kind, detail)
|
||||
|
||||
def _parse_bullet_from_master_text_styles(
|
||||
self, slide_master, placeholder_type, lvl: int
|
||||
) -> tuple[Optional[bool], Optional[str], Optional[str]]:
|
||||
"""
|
||||
从主幻灯片的文本样式中解析项目符号或编号信息。
|
||||
在主幻灯片的'p:txStyles'中查找相应的样式bucket('titleStyle'、'bodyStyle'或
|
||||
'otherStyle'),并为给定的级别提取项目符号或编号信息。
|
||||
|
||||
Args:
|
||||
slide_master: 与当前幻灯片关联的主幻灯片对象。
|
||||
placeholder_type: 来自'PP_PLACEHOLDER'的占位符类型枚举。
|
||||
lvl: 段落级别,范围在(0, 8)内。
|
||||
|
||||
Returns:
|
||||
返回一个3元组(`is_list`, `kind`, `detail`),其中:
|
||||
`is_list` - 为True/False/None,表示这是否是列表项;
|
||||
`kind` - 为以下之一:`buChar`、`buAutoNum`、`buBlip`、`buNone`或None;
|
||||
`detail` - 项目符号字符、编号类型字符串,或如果不适用则为None。
|
||||
"""
|
||||
style = self._get_master_text_style_node(slide_master, placeholder_type)
|
||||
if style is None:
|
||||
return (None, None, None)
|
||||
|
||||
lvl_pPr = style.find(f".//a:lvl{lvl + 1}pPr", namespaces=self.namespaces)
|
||||
is_list, kind, detail = self._parse_bullet_from_paragraph_properties(lvl_pPr)
|
||||
return (is_list, kind, detail)
|
||||
|
||||
def _find_level_properties_in_list_style(self, lstStyle, lvl: int):
|
||||
"""Find the level-specific paragraph properties node from a list style.
|
||||
从列表样式中查找指定级别的段落属性节点。
|
||||
|
||||
This looks for an `a:lvl{lvl+1}pPr` node inside an `a:lstStyle` element, where
|
||||
在'a:lstStyle'元素内查找'a:lvl{lvl+1}pPr'节点,其中'a:lvl1pPr'对应级别0,
|
||||
`a:lvl1pPr` corresponds to level 0, `a:lvl2pPr` to level 1, and so on.
|
||||
'a:lvl2pPr'对应级别1,依此类推。
|
||||
|
||||
Args:
|
||||
lstStyle: List style XML element `a:lstStyle`.
|
||||
lstStyle: 列表样式XML元素'a:lstStyle'。
|
||||
lvl: Paragraph level in the range (0, 8).
|
||||
lvl: 段落级别,范围在(0, 8)内。
|
||||
|
||||
Returns:
|
||||
Matching `a:lvl{lvl+1}pPr` XML element, or None if no matching element is
|
||||
匹配的'a:lvl{lvl+1}pPr'XML元素,如果未找到匹配元素则返回None。
|
||||
found.
|
||||
"""
|
||||
if lstStyle is None:
|
||||
return None
|
||||
tag = f"a:lvl{lvl + 1}pPr"
|
||||
return lstStyle.find(tag, namespaces=self.namespaces)
|
||||
|
||||
def _get_master_text_style_node(
    self, slide_master, placeholder_type
) -> Optional[etree._Element]:
    """
    Return the master text style node that applies to a placeholder.

    BODY and OBJECT placeholders use 'p:bodyStyle', TITLE placeholders use
    'p:titleStyle', and every other placeholder type falls back to
    'p:otherStyle'.

    Args:
        slide_master: Slide master object associated with the current slide.
        placeholder_type: Placeholder type enum from 'PP_PLACEHOLDER'.

    Returns:
        The matching style node ('p:bodyStyle', 'p:titleStyle' or
        'p:otherStyle') from the master's 'p:txStyles', or None when the
        master defines no text styles or the bucket is absent.
    """
    ns = self.namespaces
    tx_styles = slide_master._element.find(".//p:txStyles", namespaces=ns)
    if tx_styles is None:
        return None

    if placeholder_type in (PP_PLACEHOLDER.BODY, PP_PLACEHOLDER.OBJECT):
        bucket = "p:bodyStyle"
    elif placeholder_type == PP_PLACEHOLDER.TITLE:
        bucket = "p:titleStyle"
    else:
        bucket = "p:otherStyle"

    return tx_styles.find(bucket, namespaces=ns)
|
||||
21
mineru/utils/docx_fomatting.py
Normal file
21
mineru/utils/docx_fomatting.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from enum import Enum
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class Script(str, Enum):
    """Text script position.

    Members are also ``str`` instances, so they compare equal to their
    plain string values.
    """

    BASELINE = "baseline"  # default position
    SUB = "sub"
    SUPER = "super"
|
||||
|
||||
|
||||
class Formatting(BaseModel):
    """Formatting flags of a piece of text.

    Every flag defaults to "off" and the script position defaults to the
    baseline, so ``Formatting()`` describes unformatted text.
    """

    # Character style flags
    bold: bool = False
    italic: bool = False
    underline: bool = False
    strikethrough: bool = False

    # Script position
    script: Script = Script.BASELINE
|
||||
@@ -5,6 +5,7 @@ class BlockType:
|
||||
TABLE = 'table'
|
||||
IMAGE_BODY = 'image_body'
|
||||
TABLE_BODY = 'table_body'
|
||||
CAPTION = 'caption' # word的通用caption类型
|
||||
IMAGE_CAPTION = 'image_caption'
|
||||
TABLE_CAPTION = 'table_caption'
|
||||
IMAGE_FOOTNOTE = 'image_footnote'
|
||||
@@ -12,6 +13,7 @@ class BlockType:
|
||||
TEXT = 'text'
|
||||
TITLE = 'title'
|
||||
INTERLINE_EQUATION = 'interline_equation'
|
||||
EQUATION = "equation" # 公式(独立公式)
|
||||
LIST = 'list'
|
||||
INDEX = 'index'
|
||||
DISCARDED = 'discarded'
|
||||
@@ -129,4 +131,4 @@ class NotExtractType(Enum):
|
||||
IMAGE_CAPTION = BlockType.IMAGE_CAPTION
|
||||
TABLE_FOOTNOTE = BlockType.TABLE_FOOTNOTE
|
||||
IMAGE_FOOTNOTE = BlockType.IMAGE_FOOTNOTE
|
||||
CODE_CAPTION = BlockType.CODE_CAPTION
|
||||
CODE_CAPTION = BlockType.CODE_CAPTION
|
||||
|
||||
@@ -21,7 +21,7 @@ dependencies = [
|
||||
"click>=8.1.7",
|
||||
"loguru>=0.7.2",
|
||||
"numpy>=1.21.6",
|
||||
"pdfminer.six==20250506",
|
||||
"pdfminer.six==20251230",
|
||||
"tqdm>=4.67.1",
|
||||
"requests",
|
||||
"httpx",
|
||||
@@ -37,10 +37,17 @@ dependencies = [
|
||||
"fast-langdetect>=0.2.3,<0.3.0",
|
||||
"scikit-image>=0.25.0,<1.0.0",
|
||||
"openai>=1.70.0,<3",
|
||||
"beautifulsoup4>=4.13.5,<5",
|
||||
"beautifulsoup4>=4.13.5,<5",
|
||||
"magika>=0.6.2,<1.1.0",
|
||||
"mineru-vl-utils>=0.1.19.1,<1",
|
||||
"qwen-vl-utils>=0.0.14,<1",
|
||||
"python-docx>=1.2.0,<2",
|
||||
'pypptx-with-oxml>=1.0.3,<2',
|
||||
"mammoth>=1.11.0,<2",
|
||||
"pylatexenc>=2.10,<3",
|
||||
"lxml>=4.0.0,<7.0.0",
|
||||
"pandas>=2.3.3,<3",
|
||||
"openpyxl>=3.1.5,<4",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
|
||||
Reference in New Issue
Block a user