mirror of
https://github.com/langgenius/dify-docs.git
synced 2026-03-26 13:18:34 +07:00
452 lines
17 KiB
Python
452 lines
17 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
|
||
import os
|
||
import re
|
||
import shutil
|
||
from pathlib import Path
|
||
import logging
|
||
|
||
# 设置日志
|
||
logging.basicConfig(
|
||
level=logging.INFO,
|
||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
||
handlers=[
|
||
logging.FileHandler("conversion.log"),
|
||
logging.StreamHandler()
|
||
]
|
||
)
|
||
logger = logging.getLogger("md-to-mdx")
|
||
|
||
class MarkdownToMDXConverter:
|
||
def __init__(self, backup=True, in_place=False):
|
||
self.backup = backup
|
||
self.in_place = in_place
|
||
self.conversion_count = 0
|
||
self.error_count = 0
|
||
self.base_output_dir = None
|
||
|
||
def process_directory(self, input_dir, output_dir=None, recursive=True):
|
||
"""处理指定目录中的所有Markdown文件"""
|
||
input_path = Path(input_dir)
|
||
|
||
if not input_path.exists():
|
||
logger.error(f"输入目录不存在: {input_dir}")
|
||
return
|
||
|
||
# 保存基础输出目录,用于构建子目录输出路径
|
||
if not self.in_place and self.base_output_dir is None and output_dir:
|
||
self.base_output_dir = Path(output_dir)
|
||
self.base_input_dir = input_path
|
||
self.base_output_dir.mkdir(parents=True, exist_ok=True)
|
||
logger.info(f"创建基础输出目录: {self.base_output_dir}")
|
||
|
||
# 处理当前目录中的所有.md和.mdx文件
|
||
for file in list(input_path.glob("*.md")) + list(input_path.glob("*.mdx")):
|
||
if self.in_place:
|
||
# 在原位置处理
|
||
self._process_file(file, file.parent, delete_original=True)
|
||
else:
|
||
# 计算相对于基础输入目录的路径
|
||
if self.base_output_dir:
|
||
rel_path = file.parent.relative_to(self.base_input_dir) if file.parent != self.base_input_dir else Path('')
|
||
target_dir = self.base_output_dir / rel_path
|
||
target_dir.mkdir(parents=True, exist_ok=True)
|
||
self._process_file(file, target_dir)
|
||
else:
|
||
# 如果没有基础输出目录,则就地处理
|
||
self._process_file(file, file.parent)
|
||
|
||
# 如果需要递归处理子目录
|
||
if recursive:
|
||
for subdir in [d for d in input_path.iterdir() if d.is_dir()]:
|
||
# 跳过output目录,避免重复处理
|
||
if subdir.name == "output" or subdir.name.startswith('.'):
|
||
continue
|
||
|
||
self.process_directory(subdir, output_dir, recursive)
|
||
|
||
def _process_file(self, file_path, output_dir, delete_original=False):
|
||
"""处理单个Markdown文件"""
|
||
try:
|
||
logger.info(f"处理文件: {file_path}")
|
||
|
||
# 备份原始文件(如果需要)
|
||
if self.backup:
|
||
backup_file = str(file_path) + ".bak"
|
||
if not os.path.exists(backup_file):
|
||
shutil.copy2(file_path, backup_file)
|
||
logger.info(f"已创建备份: {backup_file}")
|
||
|
||
# 读取文件内容
|
||
with open(file_path, 'r', encoding='utf-8') as f:
|
||
content = f.read()
|
||
|
||
# 执行转换
|
||
converted_content = self.convert_content(content)
|
||
|
||
# 确定输出文件路径
|
||
output_file = output_dir / (file_path.stem + ".mdx")
|
||
|
||
# 写入转换后的内容
|
||
with open(output_file, 'w', encoding='utf-8') as f:
|
||
f.write(converted_content)
|
||
|
||
logger.info(f"转换完成: {output_file}")
|
||
self.conversion_count += 1
|
||
|
||
# 如果需要,删除原始文件
|
||
if delete_original:
|
||
try:
|
||
os.remove(file_path)
|
||
logger.info(f"已删除源文件: {file_path}")
|
||
except Exception as e:
|
||
logger.error(f"删除源文件 {file_path} 失败: {str(e)}")
|
||
|
||
except Exception as e:
|
||
logger.error(f"处理文件 {file_path} 时出错: {str(e)}")
|
||
self.error_count += 1
|
||
|
||
def convert_content(self, content):
|
||
"""将Gitbook Markdown内容转换为Mintlify MDX格式"""
|
||
|
||
# 1. 转换文档开头的h1元素为frontmatter
|
||
h1_pattern = re.compile(r'^#\s+(.+?)$', re.MULTILINE)
|
||
match = h1_pattern.search(content)
|
||
if match:
|
||
title = match.group(1).strip()
|
||
content = h1_pattern.sub(f'---\ntitle: {title}\n---\n', content, count=1)
|
||
|
||
# 2. 转换hint提示框
|
||
hint_pattern = re.compile(
|
||
r'{%\s*hint\s+style="(\w+)"\s*%}(.*?){%\s*endhint\s*%}',
|
||
re.DOTALL
|
||
)
|
||
|
||
def hint_replacer(match):
|
||
style = match.group(1)
|
||
text = match.group(2).strip()
|
||
component_name = style.capitalize() if style != "info" else "Info"
|
||
return f'<{component_name}>\n{text}\n</{component_name}>'
|
||
|
||
content = hint_pattern.sub(hint_replacer, content)
|
||
|
||
# 3. 转换卡片链接
|
||
card_pattern = re.compile(
|
||
r'{%\s*content-ref\s+url="([^"]+)"\s*%}\s*\[([^\]]+)\]\(([^)]+)\)\s*{%\s*endcontent-ref\s*%}',
|
||
re.DOTALL
|
||
)
|
||
|
||
def card_replacer(match):
|
||
url = match.group(1)
|
||
title = match.group(2)
|
||
return f'<Card title="{title}" icon="link" href="{url}">\n {title}\n</Card>'
|
||
|
||
content = card_pattern.sub(card_replacer, content)
|
||
|
||
# 4. 转换并排图片样式
|
||
# 寻找连续的图片并转换为并排布局
|
||
img_pattern = re.compile(r'!\[(.*?)\]\((.*?)\)\s*!\[(.*?)\]\((.*?)\)', re.DOTALL)
|
||
|
||
def img_side_replacer(match):
|
||
alt1 = match.group(1) or "Image 1"
|
||
src1 = match.group(2)
|
||
alt2 = match.group(3) or "Image 2"
|
||
src2 = match.group(4)
|
||
|
||
return f'''<div class="image-side-by-side">
|
||
<figure>
|
||
<img src="{src1}" alt="{alt1}" />
|
||
</figure>
|
||
<figure>
|
||
<img src="{src2}" alt="{alt2}" />
|
||
</figure>
|
||
</div>'''
|
||
|
||
content = img_pattern.sub(img_side_replacer, content)
|
||
|
||
# 5. 转换Frame包装的图片
|
||
frame_pattern = re.compile(r'<Frame>\s*<img\s+src="([^"]+)"\s+alt="([^"]+)"\s*/>\s*</Frame>', re.DOTALL)
|
||
|
||
def frame_replacer(match):
|
||
src = match.group(1)
|
||
alt = match.group(2)
|
||
return f''
|
||
|
||
content = frame_pattern.sub(frame_replacer, content)
|
||
|
||
# 5.1 转换<figure><img>格式的带有宽度和figcaption的图片为特定格式
|
||
figure_img_width_caption_pattern = re.compile(r'<figure>\s*<img\s+src="([^"]+)"\s+alt="([^"]*)"\s+width="(\d+)"\s*/?>\s*<figcaption>(?:<p>)?(.*?)(?:</p>)?</figcaption>\s*</figure>', re.DOTALL)
|
||
|
||
def figure_img_width_caption_replacer(match):
|
||
src = match.group(1)
|
||
alt = match.group(2) or ""
|
||
width = match.group(3)
|
||
caption = match.group(4).strip()
|
||
|
||
# 如果有caption,将其添加到alt中
|
||
if caption:
|
||
alt = caption
|
||
|
||
return f'''<img
|
||
src="{src}"
|
||
width="{width}"
|
||
className="mx-auto"
|
||
alt="{alt}"
|
||
/>'''
|
||
|
||
content = figure_img_width_caption_pattern.sub(figure_img_width_caption_replacer, content)
|
||
|
||
# 5.2 转换<figure><img>格式的带有宽度但没有figcaption的图片
|
||
figure_img_width_pattern = re.compile(r'<figure>\s*<img\s+src="([^"]+)"\s+alt="([^"]*)"\s+width="(\d+)"\s*/?>\s*</figure>', re.DOTALL)
|
||
|
||
def figure_img_width_replacer(match):
|
||
src = match.group(1)
|
||
alt = match.group(2) or ""
|
||
width = match.group(3)
|
||
|
||
return f'''<img
|
||
src="{src}"
|
||
width="{width}"
|
||
className="mx-auto"
|
||
alt="{alt}"
|
||
/>'''
|
||
|
||
content = figure_img_width_pattern.sub(figure_img_width_replacer, content)
|
||
|
||
# 5.3 转换<figure><img>格式的没有宽度但有figcaption的图片
|
||
figure_img_caption_pattern = re.compile(r'<figure>\s*<img\s+src="([^"]+)"\s+alt="([^"]*)"\s*/?>\s*<figcaption>(?:<p>)?(.*?)(?:</p>)?</figcaption>\s*</figure>', re.DOTALL)
|
||
|
||
def figure_img_caption_replacer(match):
|
||
src = match.group(1)
|
||
alt = match.group(2) or ""
|
||
caption = match.group(3).strip()
|
||
|
||
# 如果有caption,将其添加到alt中
|
||
if caption:
|
||
alt = caption
|
||
|
||
return f'''<img
|
||
src="{src}"
|
||
className="mx-auto"
|
||
alt="{alt}"
|
||
/>'''
|
||
|
||
content = figure_img_caption_pattern.sub(figure_img_caption_replacer, content)
|
||
|
||
# 5.4 处理没有figcaption和宽度的<figure><img>标签
|
||
figure_img_no_caption_pattern = re.compile(r'<figure>\s*<img\s+src="([^"]+)"\s+alt="([^"]*)"\s*/?>\s*</figure>', re.DOTALL)
|
||
|
||
def figure_img_no_caption_replacer(match):
|
||
src = match.group(1)
|
||
alt = match.group(2) or ""
|
||
|
||
return f'''<img
|
||
src="{src}"
|
||
className="mx-auto"
|
||
alt="{alt}"
|
||
/>'''
|
||
|
||
content = figure_img_no_caption_pattern.sub(figure_img_no_caption_replacer, content)
|
||
|
||
# 6. 转换Tabs组件
|
||
# 先匹配整个tabs块
|
||
tabs_pattern = re.compile(
|
||
r'{%\s*tabs\s*%}(.*?){%\s*endtabs\s*%}',
|
||
re.DOTALL
|
||
)
|
||
|
||
def tabs_replacer(match):
|
||
tabs_content = match.group(1)
|
||
# 匹配每个tab
|
||
tab_pattern = re.compile(
|
||
r'{%\s*tab\s+title="([^"]+)"\s*%}(.*?){%\s*endtab\s*%}',
|
||
re.DOTALL
|
||
)
|
||
|
||
# 构建新的Tabs组件
|
||
tabs_start = "<Tabs>"
|
||
tabs_items = []
|
||
|
||
for tab_match in tab_pattern.finditer(tabs_content):
|
||
title = tab_match.group(1)
|
||
content = tab_match.group(2).strip()
|
||
tabs_items.append(f' <Tab title="{title}">\n {content}\n </Tab>')
|
||
|
||
tabs_end = "</Tabs>"
|
||
|
||
return tabs_start + "\n" + "\n".join(tabs_items) + "\n" + tabs_end
|
||
|
||
content = tabs_pattern.sub(tabs_replacer, content)
|
||
|
||
# 7. 处理有限制大小的独立img标签
|
||
img_size_pattern = re.compile(r'<img\s+src="([^"]+)"\s+width="(\d+)"(?:\s+alt="([^"]*)")?\s*/>', re.DOTALL)
|
||
|
||
def img_size_replacer(match):
|
||
src = match.group(1)
|
||
width = match.group(2)
|
||
alt = match.group(3) if match.group(3) else ""
|
||
|
||
return f'''<img
|
||
src="{src}"
|
||
width="{width}"
|
||
className="mx-auto"
|
||
alt="{alt}"
|
||
/>'''
|
||
|
||
content = img_size_pattern.sub(img_size_replacer, content)
|
||
|
||
# 7.1 处理各种形式的独立<img>标签
|
||
standalone_img_pattern = re.compile(r'<img\s+src="([^"]+)"(?:\s+alt="([^"]*)")?[^>]*>', re.DOTALL)
|
||
|
||
def standalone_img_replacer(match):
|
||
src = match.group(1)
|
||
alt = match.group(2) if match.group(2) else ""
|
||
|
||
return f'''<img
|
||
src="{src}"
|
||
className="mx-auto"
|
||
alt="{alt}"
|
||
/>'''
|
||
|
||
content = standalone_img_pattern.sub(standalone_img_replacer, content)
|
||
|
||
# 8. 将markdown表格转换为MDX表格格式
|
||
# 使用正则表达式匹配markdown表格
|
||
table_pattern = re.compile(r'(\|.*\|\n\|[-:\s|]*\|\n(?:\|.*\|\n)+)', re.MULTILINE)
|
||
|
||
def table_replacer(match):
|
||
md_table = match.group(1)
|
||
lines = md_table.strip().split('\n')
|
||
|
||
# 提取表头和表体
|
||
header_row = lines[0]
|
||
header_cells = [cell.strip() for cell in header_row.split('|')[1:-1]]
|
||
|
||
# 忽略分隔行
|
||
body_rows = lines[2:]
|
||
body_cells_rows = []
|
||
for row in body_rows:
|
||
cells = [cell.strip() for cell in row.split('|')[1:-1]]
|
||
body_cells_rows.append(cells)
|
||
|
||
# 按照要求的格式构建MDX表格
|
||
mdx_table = "<table>\n <thead>\n <tr>\n"
|
||
|
||
# 添加表头
|
||
for cell in header_cells:
|
||
mdx_table += f" <th>{cell}</th>\n"
|
||
|
||
mdx_table += " </tr>\n </thead>\n <tbody>\n"
|
||
|
||
# 添加表体
|
||
for row_cells in body_cells_rows:
|
||
mdx_table += " <tr>\n"
|
||
for cell in row_cells:
|
||
# 先转换Markdown链接为HTML链接
|
||
# 匹配 [text](url) 格式
|
||
link_pattern = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
|
||
cell = link_pattern.sub(r'<a href="\2">\1</a>', cell)
|
||
|
||
# 替换<br>标签为</p><p>,实现正确的段落分隔
|
||
# 先处理<br>标签(可能有不同形式:<br>, <br/>, <br />)
|
||
br_pattern = re.compile(r'<br\s*/?>')
|
||
|
||
# 处理单元格中的<p>和<br>标签
|
||
if '<p>' in cell or br_pattern.search(cell):
|
||
# 如果已有<p>标签但包含<br>,替换<br>为</p><p>
|
||
if '<p>' in cell and br_pattern.search(cell):
|
||
cell = br_pattern.sub(r'</p>\n <p>', cell)
|
||
# 清理末尾的空<br>标签
|
||
cell = re.sub(r'<br\s*/?>(\s*</p>)', r'\1', cell)
|
||
# 如果没有<p>标签但有<br>,用<p>标签包装每个段落
|
||
elif br_pattern.search(cell) and not '<p>' in cell:
|
||
paragraphs = br_pattern.split(cell)
|
||
cell = '<p>' + '</p>\n <p>'.join([p.strip() for p in paragraphs if p.strip()]) + '</p>'
|
||
|
||
# 确保缩进正确
|
||
mdx_table += f" <td>\n {cell}\n </td>\n"
|
||
else:
|
||
# 普通文本单元格
|
||
mdx_table += f" <td>{cell}</td>\n"
|
||
mdx_table += " </tr>\n"
|
||
|
||
mdx_table += " </tbody>\n</table>"
|
||
|
||
return mdx_table
|
||
|
||
content = table_pattern.sub(table_replacer, content)
|
||
|
||
return content
|
||
|
||
def get_statistics(self):
|
||
"""返回处理统计信息"""
|
||
return {
|
||
"conversion_count": self.conversion_count,
|
||
"error_count": self.error_count
|
||
}
|
||
|
||
def main():
|
||
print("=" * 60)
|
||
print("Gitbook Markdown 转 Mintlify MDX 转换工具")
|
||
print("=" * 60)
|
||
|
||
# 通过交互方式获取输入路径
|
||
input_path_str = input("请输入源文件或目录路径: ")
|
||
input_path = Path(input_path_str)
|
||
|
||
if not input_path.exists():
|
||
print(f"错误: 路径 '{input_path_str}' 不存在!")
|
||
return
|
||
|
||
# 询问是否递归处理子目录
|
||
recursive = False
|
||
if input_path.is_dir():
|
||
recursive_input = input("是否递归处理所有子目录? (y/n): ").lower()
|
||
recursive = recursive_input in ('y', 'yes')
|
||
|
||
# 询问是否创建备份
|
||
backup_input = input("是否创建备份文件? (y/n, 默认:y): ").lower()
|
||
create_backup = backup_input in ('', 'y', 'yes')
|
||
|
||
# 询问是否原地转换并删除源文件
|
||
in_place_input = input("是否在原地转换并删除源文件? (y/n, 默认:n): ").lower()
|
||
in_place = in_place_input in ('y', 'yes')
|
||
|
||
# 确定输出目录
|
||
output_dir = None
|
||
if not in_place:
|
||
if input_path.is_file():
|
||
output_dir = input_path.parent / "output"
|
||
else:
|
||
output_dir = input_path / "output"
|
||
output_dir.mkdir(parents=True, exist_ok=True)
|
||
print(f"输出目录已创建: {output_dir}")
|
||
|
||
# 创建转换器并处理文件
|
||
converter = MarkdownToMDXConverter(backup=create_backup, in_place=in_place)
|
||
|
||
if input_path.is_file() and input_path.suffix.lower() == '.md':
|
||
# 处理单个文件
|
||
if in_place:
|
||
converter._process_file(input_path, input_path.parent, delete_original=True)
|
||
else:
|
||
converter._process_file(input_path, output_dir)
|
||
elif input_path.is_dir():
|
||
# 处理目录
|
||
converter.process_directory(input_path, output_dir, recursive)
|
||
else:
|
||
logger.error(f"无效的输入路径: {input_path_str}")
|
||
print(f"错误: '{input_path_str}' 不是有效的Markdown文件或目录!")
|
||
return
|
||
|
||
# 打印统计信息
|
||
stats = converter.get_statistics()
|
||
print("=" * 60)
|
||
print(f"转换完成! 成功转换: {stats['conversion_count']}个文件, 错误: {stats['error_count']}个文件")
|
||
if not in_place and output_dir:
|
||
print(f"转换结果已保存至: {output_dir}")
|
||
print("=" * 60)
|
||
|
||
if __name__ == "__main__":
|
||
main() |