Files
dify-docs/scripts/md-to-mdx-3.18-backup.py
2025-03-18 16:02:34 +08:00

340 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import re
import shutil
from pathlib import Path
import logging
# 设置日志
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler("conversion.log"),
logging.StreamHandler()
]
)
logger = logging.getLogger("md-to-mdx")
class MarkdownToMDXConverter:
def __init__(self, backup=True):
self.backup = backup
self.conversion_count = 0
self.error_count = 0
self.base_output_dir = None
def process_directory(self, input_dir, output_dir=None, recursive=True):
"""处理指定目录中的所有Markdown文件"""
input_path = Path(input_dir)
if not input_path.exists():
logger.error(f"输入目录不存在: {input_dir}")
return
# 保存基础输出目录,用于构建子目录输出路径
if self.base_output_dir is None and output_dir:
self.base_output_dir = Path(output_dir)
self.base_input_dir = input_path
self.base_output_dir.mkdir(parents=True, exist_ok=True)
logger.info(f"创建基础输出目录: {self.base_output_dir}")
# 处理当前目录中的所有.md文件
for file in input_path.glob("*.md"):
# 计算相对于基础输入目录的路径
if self.base_output_dir:
rel_path = file.parent.relative_to(self.base_input_dir) if file.parent != self.base_input_dir else Path('')
target_dir = self.base_output_dir / rel_path
target_dir.mkdir(parents=True, exist_ok=True)
self._process_file(file, target_dir)
else:
# 如果没有基础输出目录,则就地处理
self._process_file(file, file.parent)
# 如果需要递归处理子目录
if recursive:
for subdir in [d for d in input_path.iterdir() if d.is_dir()]:
# 跳过output目录避免重复处理
if subdir.name == "output" or subdir.name.startswith('.'):
continue
self.process_directory(subdir, output_dir, recursive)
def _process_file(self, file_path, output_dir):
"""处理单个Markdown文件"""
try:
logger.info(f"处理文件: {file_path}")
# 备份原始文件
if self.backup:
backup_file = str(file_path) + ".bak"
if not os.path.exists(backup_file):
shutil.copy2(file_path, backup_file)
logger.info(f"已创建备份: {backup_file}")
# 读取文件内容
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read()
# 执行转换
converted_content = self.convert_content(content)
# 确定输出文件路径
output_file = output_dir / (file_path.stem + ".mdx")
# 写入转换后的内容
with open(output_file, 'w', encoding='utf-8') as f:
f.write(converted_content)
logger.info(f"转换完成: {output_file}")
self.conversion_count += 1
except Exception as e:
logger.error(f"处理文件 {file_path} 时出错: {str(e)}")
self.error_count += 1
def convert_content(self, content):
"""将Gitbook Markdown内容转换为Mintlify MDX格式"""
# 1. 转换文档开头的h1元素为frontmatter
h1_pattern = re.compile(r'^#\s+(.+?)$', re.MULTILINE)
match = h1_pattern.search(content)
if match:
title = match.group(1).strip()
content = h1_pattern.sub(f'---\ntitle: {title}\n---\n', content, count=1)
# 2. 转换hint提示框
hint_pattern = re.compile(
r'{%\s*hint\s+style="(\w+)"\s*%}(.*?){%\s*endhint\s*%}',
re.DOTALL
)
def hint_replacer(match):
style = match.group(1)
text = match.group(2).strip()
component_name = style.capitalize() if style != "info" else "Info"
return f'<{component_name}>\n{text}\n</{component_name}>'
content = hint_pattern.sub(hint_replacer, content)
# 3. 转换卡片链接
card_pattern = re.compile(
r'{%\s*content-ref\s+url="([^"]+)"\s*%}\s*\[([^\]]+)\]\(([^)]+)\)\s*{%\s*endcontent-ref\s*%}',
re.DOTALL
)
def card_replacer(match):
url = match.group(1)
title = match.group(2)
return f'<Card title="{title}" icon="link" href="{url}">\n {title}\n</Card>'
content = card_pattern.sub(card_replacer, content)
# 4. 转换并排图片样式
# 寻找连续的图片并转换为并排布局
img_pattern = re.compile(r'!\[(.*?)\]\((.*?)\)\s*!\[(.*?)\]\((.*?)\)', re.DOTALL)
def img_side_replacer(match):
alt1 = match.group(1) or "Image 1"
src1 = match.group(2)
alt2 = match.group(3) or "Image 2"
src2 = match.group(4)
return f'''<div class="image-side-by-side">
<figure>
<img src="{src1}" alt="{alt1}" />
</figure>
<figure>
<img src="{src2}" alt="{alt2}" />
</figure>
</div>'''
content = img_pattern.sub(img_side_replacer, content)
# 5. 转换Frame包装的图片
frame_pattern = re.compile(r'<Frame>\s*<img\s+src="([^"]+)"\s+alt="([^"]+)"\s*/>\s*</Frame>', re.DOTALL)
def frame_replacer(match):
src = match.group(1)
alt = match.group(2)
return f'<figure>\n <img src="{src}" alt="{alt}" />\n</figure>'
content = frame_pattern.sub(frame_replacer, content)
# 6. 转换Tabs组件
# 先匹配整个tabs块
tabs_pattern = re.compile(
r'{%\s*tabs\s*%}(.*?){%\s*endtabs\s*%}',
re.DOTALL
)
def tabs_replacer(match):
tabs_content = match.group(1)
# 匹配每个tab
tab_pattern = re.compile(
r'{%\s*tab\s+title="([^"]+)"\s*%}(.*?){%\s*endtab\s*%}',
re.DOTALL
)
# 构建新的Tabs组件
tabs_start = "<Tabs>"
tabs_items = []
for tab_match in tab_pattern.finditer(tabs_content):
title = tab_match.group(1)
content = tab_match.group(2).strip()
tabs_items.append(f' <Tab title="{title}">\n {content}\n </Tab>')
tabs_end = "</Tabs>"
return tabs_start + "\n" + "\n".join(tabs_items) + "\n" + tabs_end
content = tabs_pattern.sub(tabs_replacer, content)
# 7. 处理有限制大小的图片
img_size_pattern = re.compile(r'<img\s+src="([^"]+)"\s+width="(\d+)"(?:\s+alt="([^"]*)")?\s*/>', re.DOTALL)
def img_size_replacer(match):
src = match.group(1)
width = match.group(2)
alt = match.group(3) if match.group(3) else ""
return f'''<img
src="{src}"
width="{width}"
className="mx-auto"
alt="{alt}"
/>'''
content = img_size_pattern.sub(img_size_replacer, content)
# 8. 将markdown表格转换为MDX表格格式
# 使用正则表达式匹配markdown表格
table_pattern = re.compile(r'(\|.*\|\n\|[-:\s|]*\|\n(?:\|.*\|\n)+)', re.MULTILINE)
def table_replacer(match):
md_table = match.group(1)
lines = md_table.strip().split('\n')
# 提取表头和表体
header_row = lines[0]
header_cells = [cell.strip() for cell in header_row.split('|')[1:-1]]
# 忽略分隔行
body_rows = lines[2:]
body_cells_rows = []
for row in body_rows:
cells = [cell.strip() for cell in row.split('|')[1:-1]]
body_cells_rows.append(cells)
# 按照要求的格式构建MDX表格
mdx_table = "<table>\n <thead>\n <tr>\n"
# 添加表头
for cell in header_cells:
mdx_table += f" <th>{cell}</th>\n"
mdx_table += " </tr>\n </thead>\n <tbody>\n"
# 添加表体
for row_cells in body_cells_rows:
mdx_table += " <tr>\n"
for cell in row_cells:
# 先转换Markdown链接为HTML链接
# 匹配 [text](url) 格式
link_pattern = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
cell = link_pattern.sub(r'<a href="\2">\1</a>', cell)
# 替换<br>标签为</p><p>,实现正确的段落分隔
# 先处理<br>标签(可能有不同形式:<br>, <br/>, <br />
br_pattern = re.compile(r'<br\s*/?>')
# 处理单元格中的<p>和<br>标签
if '<p>' in cell or br_pattern.search(cell):
# 如果已有<p>标签但包含<br>,替换<br>为</p><p>
if '<p>' in cell and br_pattern.search(cell):
cell = br_pattern.sub(r'</p>\n <p>', cell)
# 清理末尾的空<br>标签
cell = re.sub(r'<br\s*/?>(\s*</p>)', r'\1', cell)
# 如果没有<p>标签但有<br>,用<p>标签包装每个段落
elif br_pattern.search(cell) and not '<p>' in cell:
paragraphs = br_pattern.split(cell)
cell = '<p>' + '</p>\n <p>'.join([p.strip() for p in paragraphs if p.strip()]) + '</p>'
# 确保缩进正确
mdx_table += f" <td>\n {cell}\n </td>\n"
else:
# 普通文本单元格
mdx_table += f" <td>{cell}</td>\n"
mdx_table += " </tr>\n"
mdx_table += " </tbody>\n</table>"
return mdx_table
content = table_pattern.sub(table_replacer, content)
return content
def get_statistics(self):
"""返回处理统计信息"""
return {
"conversion_count": self.conversion_count,
"error_count": self.error_count
}
def main():
print("=" * 60)
print("Gitbook Markdown 转 Mintlify MDX 转换工具")
print("=" * 60)
# 通过交互方式获取输入路径
input_path_str = input("请输入源文件或目录路径: ")
input_path = Path(input_path_str)
if not input_path.exists():
print(f"错误: 路径 '{input_path_str}' 不存在!")
return
# 询问是否递归处理子目录
recursive = False
if input_path.is_dir():
recursive_input = input("是否递归处理所有子目录? (y/n): ").lower()
recursive = recursive_input in ('y', 'yes')
# 询问是否创建备份
backup_input = input("是否创建备份文件? (y/n, 默认:y): ").lower()
create_backup = backup_input in ('', 'y', 'yes')
# 创建output目录
if input_path.is_file():
output_dir = input_path.parent / "output"
else:
output_dir = input_path / "output"
# 创建转换器并处理文件
converter = MarkdownToMDXConverter(backup=create_backup)
if input_path.is_file() and input_path.suffix.lower() == '.md':
# 处理单个文件
output_dir.mkdir(parents=True, exist_ok=True)
print(f"输出目录已创建: {output_dir}")
converter._process_file(input_path, output_dir)
elif input_path.is_dir():
# 处理目录
converter.process_directory(input_path, output_dir, recursive)
else:
logger.error(f"无效的输入路径: {input_path_str}")
print(f"错误: '{input_path_str}' 不是有效的Markdown文件或目录!")
return
# 打印统计信息
stats = converter.get_statistics()
print("=" * 60)
print(f"转换完成! 成功转换: {stats['conversion_count']}个文件, 错误: {stats['error_count']}个文件")
print(f"转换结果已保存至: {output_dir}")
print("=" * 60)
if __name__ == "__main__":
main()