Files
dify-docs/scripts/doc_migration_helper.py
2025-04-29 14:35:07 +08:00

589 lines
26 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
文档迁移助手
这个脚本用于辅助 gitbook 文档(dify-docs)迁移至 mintlifydify-docs-mintlify
主要功能包括:
1. 图片路径替换:从原始文档查找并替换为在线图片链接
2. 文档引用路径替换:将相对路径替换为绝对路径
3. 支持交互式确认每个修改
使用方法:
python doc_migration_helper.py <目标文件路径>
例如:
python doc_migration_helper.py /Users/allen/Documents/dify-docs-mintlify/zh-hans/guides/workflow/nodes/parameter-extractor.mdx
"""
import os
import re
import sys
import json
from pathlib import Path
# ANSI 颜色代码
class Colors:
HEADER = '\033[95m'
BLUE = '\033[94m'
CYAN = '\033[96m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
ENDC = '\033[0m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'
class DocMigrationHelper:
def __init__(self, target_file, source_dir="/Users/allen/Documents/dify-docs",
mintlify_dir="/Users/allen/Documents/dify-docs-mintlify"):
"""
初始化文档迁移助手
Args:
target_file: 要处理的目标文件路径
source_dir: 源文档目录路径
mintlify_dir: mintlify文档目录路径
"""
self.target_file = target_file
self.source_dir = source_dir
self.mintlify_dir = mintlify_dir
# 获取docs.json内容用于路径映射
self.docs_config = self._load_docs_config()
# 解析目标文件的相对路径
self.rel_path = os.path.relpath(target_file, mintlify_dir)
# 推断对应的源文件路径
self.source_file = self._infer_source_file_path()
# 图片映射缓存
self.image_url_cache = {}
def _load_docs_config(self):
"""加载docs.json配置文件"""
try:
docs_config_path = os.path.join(self.mintlify_dir, "docs.json")
with open(docs_config_path, 'r', encoding='utf-8') as f:
return json.load(f)
except Exception as e:
print(f"{Colors.RED}无法加载docs.json: {e}{Colors.ENDC}")
return {}
def _infer_source_file_path(self):
"""推断源文件路径"""
# 从mintlify路径推断原始文档中对应的路径
parts = self.rel_path.split(os.sep)
# 处理语言差异 (zh-hans -> zh_CN)
if parts[0] == "zh-hans":
lang_dir = "zh_CN"
elif parts[0] == "en":
lang_dir = "en_US"
else:
lang_dir = parts[0]
# 实际目标文件名称
target_basename = os.path.basename(self.target_file)
if target_basename.endswith(".mdx"):
target_basename = target_basename[:-4]
# 收集可能的路径
potential_paths = []
# 处理文件扩展名 (.mdx -> )
rest_path = os.path.join(*parts[1:])
if rest_path.endswith(".mdx"):
rest_path = rest_path[:-4] + ".md"
# 1. 直接对应路径
direct_path = os.path.join(self.source_dir, lang_dir, rest_path)
potential_paths.append(direct_path)
# 2. 处理节点路径差异 (nodes -> node)
node_path = direct_path.replace("nodes", "node")
if node_path != direct_path:
potential_paths.append(node_path)
# 3. 可能添加了 guides 前缀
guides_path = os.path.join(self.source_dir, lang_dir, "guides", rest_path)
if guides_path != direct_path:
potential_paths.append(guides_path)
# 也考虑 guides 和 node 的组合
guides_node_path = guides_path.replace("nodes", "node")
if guides_node_path != guides_path:
potential_paths.append(guides_node_path)
# 4. 如果是工作流节点文件,尝试特定目录
if "workflow" in rest_path and "nodes" in rest_path:
workflow_node_path = os.path.join(self.source_dir, lang_dir, "guides", "workflow", "node", target_basename + ".md")
potential_paths.append(workflow_node_path)
# 先检查所有可能的直接匹配路径
for path in potential_paths:
if os.path.exists(path):
print(f"{Colors.GREEN}找到源文件: {path}{Colors.ENDC}")
return path
# 如果上面的匹配都失败,尝试一些含有繁体/简体变体的目录
if "workflow" in rest_path and "nodes" in rest_path:
# 尝试搜索node目录
node_dir = os.path.join(self.source_dir, lang_dir, "guides", "workflow", "node")
if os.path.exists(node_dir):
# 对比文件名,考虑字符替换(如 - 和 _
target_name_variants = [
target_basename,
target_basename.replace("-", "_"),
target_basename.replace("_", "-")
]
for file in os.listdir(node_dir):
if file.endswith(".md"):
file_basename = os.path.splitext(file)[0]
# 尝试各种变体
for variant in target_name_variants:
if file_basename == variant:
found_path = os.path.join(node_dir, file)
print(f"{Colors.GREEN}找到匹配的源文件: {found_path}{Colors.ENDC}")
return found_path
# 如果仍然找不到,尝试搜索整个文档目录
print(f"{Colors.YELLOW}尝试搜索整个文档目录...{Colors.ENDC}")
found_files = []
for root, _, files in os.walk(os.path.join(self.source_dir, lang_dir)):
for file in files:
if file.endswith(".md"):
file_basename = os.path.splitext(file)[0]
# 比较文件名的各种变体
if (file_basename == target_basename or
file_basename == target_basename.replace("-", "_") or
file_basename == target_basename.replace("_", "-")):
found_files.append(os.path.join(root, file))
if found_files:
# 如果找到多个文件,选择路径最相似的
if len(found_files) > 1:
best_match = None
best_score = -1
current_parts = rest_path.split(os.sep)
for file_path in found_files:
rel_path = os.path.relpath(file_path, self.source_dir)
rel_parts = rel_path.split(os.sep)
# 计算路径部分的重叠数量
score = sum(1 for a, b in zip(current_parts, rel_parts) if a == b or a.replace("nodes", "node") == b)
if score > best_score:
best_score = score
best_match = file_path
print(f"{Colors.GREEN}找到最匹配的源文件: {best_match}{Colors.ENDC}")
return best_match
else:
print(f"{Colors.GREEN}找到源文件: {found_files[0]}{Colors.ENDC}")
return found_files[0]
print(f"{Colors.YELLOW}无法找到对应的源文件{Colors.ENDC}")
return None
def get_corresponding_image_url(self, local_path):
"""
根据本地图片路径找到对应的在线URL
Args:
local_path: 本地图片路径,例如 /zh-hans/user-guide/.gitbook/assets/image (66).png
Returns:
online_url: 在线图片URL
"""
# 如果已经缓存过,直接返回
if local_path in self.image_url_cache:
return self.image_url_cache[local_path]
# 获取本地图片文件名和图片序号
local_img_name = os.path.basename(local_path)
img_number_match = re.search(r'\((\d+)\)', local_img_name)
img_number = img_number_match.group(1) if img_number_match else None
# 直接尝试根据目标文件路径推断对应的源文件
if not self.source_file:
print(f"{Colors.YELLOW}无法找到对应的源文件,尝试查找相关文件...{Colors.ENDC}")
# 尝试从目标文件名推断源文件名
target_basename = os.path.basename(self.target_file).replace('.mdx', '')
# 构建可能的源文件路径
parts = self.rel_path.split(os.sep)
if parts[0] == "zh-hans":
lang_dir = "zh_CN"
elif parts[0] == "en":
lang_dir = "en_US"
else:
lang_dir = parts[0]
# 尝试在guides/workflow/node目录下查找
possible_source_dir = os.path.join(self.source_dir, lang_dir, "guides", "workflow", "node")
if os.path.exists(possible_source_dir):
for file in os.listdir(possible_source_dir):
if file.endswith(".md") and file.startswith(target_basename.replace("-", "_")):
self.source_file = os.path.join(possible_source_dir, file)
print(f"{Colors.GREEN}找到可能的源文件: {self.source_file}{Colors.ENDC}")
break
# 如果找不到源文件,尝试在整个文档中搜索图片
if not self.source_file or not os.path.exists(self.source_file):
print(f"{Colors.YELLOW}尝试在整个文档中搜索图片...{Colors.ENDC}")
# 搜索整个源目录中的所有.md文件
all_md_files = []
for root, _, files in os.walk(os.path.join(self.source_dir, "zh_CN")):
for file in files:
if file.endswith(".md"):
all_md_files.append(os.path.join(root, file))
# 在所有文件中搜索图片URL
for md_file in all_md_files:
try:
with open(md_file, 'r', encoding='utf-8') as f:
content = f.read()
# 查找图片序号匹配
if img_number:
# 查找包含特定序号的图片
url_matches = re.findall(r'!\[.*?\]\((https://assets-docs\.dify\.ai/[^)]+)\)', content)
for url in url_matches:
# 如果URL包含图片名字的关键部分可能是匹配项
if url.endswith(".png") or url.endswith(".jpg") or url.endswith(".jpeg") or url.endswith(".gif"):
self.image_url_cache[local_path] = url
print(f"{Colors.GREEN}在文件 {md_file} 中找到可能匹配的图片URL: {url}{Colors.ENDC}")
return url
except Exception as e:
continue
# 如果找不到返回构造的URL
# 默认路径构造
if parts[0] == "zh-hans":
constructed_url = f"https://assets-docs.dify.ai/dify-enterprise-mintlify/zh_CN/guides/workflow/node/{img_number}.png"
print(f"{Colors.YELLOW}未找到匹配图片构造URL: {constructed_url}{Colors.ENDC}")
return constructed_url
return None
# 读取源文件内容
try:
with open(self.source_file, 'r', encoding='utf-8') as f:
source_content = f.read()
# 在源文件中查找图片链接
online_urls = re.findall(r'!\[.*?\]\((https://assets-docs\.dify\.ai/[^)]+)\)', source_content)
# 首先尝试基于图片序号匹配
if img_number:
for url in online_urls:
# 检查URL是否包含相同序号或相似模式
if f"{img_number}.png" in url or f"{img_number})" in url:
self.image_url_cache[local_path] = url
return url
# 然后尝试文件名匹配
for url in online_urls:
url_basename = os.path.basename(url)
# 精确匹配
if url_basename == local_img_name:
self.image_url_cache[local_path] = url
return url
# 尝试模糊匹配(移除数字和特殊字符后比较)
clean_local = re.sub(r'[^a-zA-Z]', '', local_img_name)
clean_url = re.sub(r'[^a-zA-Z]', '', url_basename)
if clean_local and clean_url and clean_local == clean_url:
self.image_url_cache[local_path] = url
return url
# 查找Frame组件中的图片
frame_urls = re.findall(r'<Frame[^>]*>.*?<img[^>]*src="(https://assets-docs\.dify\.ai/[^"]+)".*?</Frame>',
source_content, re.DOTALL)
for url in frame_urls:
url_basename = os.path.basename(url)
if url_basename == local_img_name or re.sub(r'[^a-zA-Z]', '', url_basename) == re.sub(r'[^a-zA-Z]', '', local_img_name):
self.image_url_cache[local_path] = url
return url
# 如果在源文件中找不到匹配的URL尝试在相关文件中查找
related_files = []
source_dir = os.path.dirname(self.source_file)
for file in os.listdir(source_dir):
if file.endswith(".md") and file != os.path.basename(self.source_file):
related_files.append(os.path.join(source_dir, file))
for related_file in related_files:
try:
with open(related_file, 'r', encoding='utf-8') as f:
related_content = f.read()
related_urls = re.findall(r'!\[.*?\]\((https://assets-docs\.dify\.ai/[^)]+)\)', related_content)
for url in related_urls:
if img_number and (f"{img_number}.png" in url or f"{img_number})" in url):
self.image_url_cache[local_path] = url
print(f"{Colors.GREEN}在相关文件 {related_file} 中找到匹配图片: {url}{Colors.ENDC}")
return url
except Exception as e:
continue
# 最后尝试根据目录结构构造URL
relative_source_path = os.path.relpath(self.source_file, self.source_dir)
dir_parts = os.path.dirname(relative_source_path).split(os.sep)
if img_number and len(dir_parts) >= 2:
# 使用目录结构构造可能的URL
if dir_parts[0] == "zh_CN":
constructed_url = f"https://assets-docs.dify.ai/dify-enterprise-mintlify/{dir_parts[0]}/{'/'.join(dir_parts[1:])}/{img_number}.png"
print(f"{Colors.YELLOW}未找到匹配图片构造URL: {constructed_url}{Colors.ENDC}")
return constructed_url
return None
except Exception as e:
print(f"{Colors.RED}读取源文件时出错: {e}{Colors.ENDC}")
return None
def get_absolute_doc_path(self, relative_path):
"""
将相对文档路径转换为绝对路径
Args:
relative_path: 相对路径,例如 ./iteration.md 或 http-request.md
Returns:
absolute_path: 绝对路径,例如 /zh-hans/guides/workflow/nodes/iteration
"""
# 如果已经是绝对路径,直接返回
if relative_path.startswith('/'):
return relative_path
# 如果是外部链接,直接返回
if relative_path.startswith(('http://', 'https://')):
return relative_path
# 提取锁点信息(如果有的话)
fragment = ""
if '#' in relative_path:
relative_path, fragment = relative_path.split('#', 1)
fragment = f'#{fragment}'
# 移除.md或.mdx扩展名
if relative_path.endswith(('.md', '.mdx')):
extension = '.md' if relative_path.endswith('.md') else '.mdx'
relative_path = relative_path[:-len(extension)]
# 获取当前文件的语言前缀(例如 zh-hans
lang_prefix = self.rel_path.split(os.sep)[0]
# 处理相对路径
current_dir = os.path.dirname(self.rel_path)
current_dir_parts = current_dir.split(os.sep)
# 根据不同类型的相对路径进行处理
if relative_path.startswith('./'):
# ./file.md 形式
relative_path = relative_path[2:]
full_path = os.path.normpath(os.path.join(current_dir, relative_path))
elif relative_path.startswith('../'):
# ../file.md 形式
full_path = os.path.normpath(os.path.join(current_dir, relative_path))
else:
# 简单名称 file.md 形式
# 首先检查是否在同一目录下
basename = os.path.basename(relative_path)
same_level_path = os.path.normpath(os.path.join(current_dir, basename))
# 检查实际文件是否存在
if os.path.exists(os.path.join(self.mintlify_dir, same_level_path + '.mdx')):
full_path = same_level_path
else:
# 如果是节点文件,通常在 /nodes/ 目录下
# 查找是否在当前语言的 workflow/nodes 目录下
if "workflow" in current_dir and ("node" in current_dir or "nodes" in current_dir):
# 构造可能的节点路径
possible_path = f"{lang_prefix}/guides/workflow/nodes/{basename}"
if os.path.exists(os.path.join(self.mintlify_dir, possible_path + '.mdx')):
full_path = possible_path
else:
# 如果不存在,使用默认的同级目录路径
full_path = same_level_path
print(f"{Colors.YELLOW}警告: 无法找到文件 {possible_path}.mdx使用默认路径{Colors.ENDC}")
else:
# 尝试搜索整个 mintlify 目录
matches = []
for root, _, files in os.walk(os.path.join(self.mintlify_dir, lang_prefix)):
for file in files:
if file == f"{basename}.mdx" or file == f"{basename}.md":
rel_file_path = os.path.relpath(os.path.join(root, file), self.mintlify_dir)
# 移除扩展名
rel_file_path = os.path.splitext(rel_file_path)[0]
matches.append(rel_file_path)
if matches:
# 如果找到多个匹配,选择与当前目录最相似的
if len(matches) > 1:
best_match = None
best_score = -1
for match in matches:
match_parts = match.split(os.sep)
# 计算路径部分的重叠数量
score = sum(1 for a, b in zip(current_dir_parts, match_parts[1:]) if a == b)
if score > best_score:
best_score = score
best_match = match
full_path = best_match
else:
full_path = matches[0]
else:
# 如果找不到匹配的文件,使用默认的同级目录路径
full_path = same_level_path
print(f"{Colors.YELLOW}警告: 无法找到文件 {basename}.mdx使用默认路径{Colors.ENDC}")
# 确保路径以 / 开头
if not full_path.startswith('/'):
full_path = '/' + full_path
# 添加锁点(如果有的话)
return full_path + fragment
def process_file(self):
"""处理文件,替换图片路径和文档引用路径"""
try:
# 读取目标文件内容
with open(self.target_file, 'r', encoding='utf-8') as f:
content = f.read()
# 存储修改项
changes = []
# 1. 查找并替换Markdown格式图片
# ![alt text](/zh-hans/user-guide/.gitbook/assets/image.png)
md_img_pattern = re.compile(r'!\[([^\]]*)\]\((/[^)]+)\)')
for match in md_img_pattern.finditer(content):
alt_text = match.group(1)
local_path = match.group(2)
full_match = match.group(0)
# 获取对应的在线URL
online_url = self.get_corresponding_image_url(local_path)
if online_url:
new_text = f'![{alt_text}]({online_url})'
changes.append((full_match, new_text, '图片链接'))
# 2. 查找并替换Frame组件中的图片
frame_img_pattern = re.compile(r'(<Frame[^>]*>[\s\S]*?<img[^>]*src=")(/[^"]+)("[^>]*>[\s\S]*?</Frame>)')
for match in frame_img_pattern.finditer(content):
prefix = match.group(1)
local_path = match.group(2)
suffix = match.group(3)
full_match = match.group(0)
# 获取对应的在线URL
online_url = self.get_corresponding_image_url(local_path)
if online_url:
new_text = f'{prefix}{online_url}{suffix}'
changes.append((full_match, new_text, 'Frame组件图片'))
# 3. 查找并替换文档引用链接
# [link text](./path/to/file) 或 [link text](path/to/file)
doc_link_pattern = re.compile(r'\[([^\]]+)\]\((\./[^)]+\.md(?:#[^)]*)?|\.\./[^)]+\.md(?:#[^)]*)?|[^)]+\.md(?:#[^)]*)?)\)')
for match in doc_link_pattern.finditer(content):
link_text = match.group(1)
rel_path = match.group(2)
full_match = match.group(0)
# 检查是否包含锚点
fragment = ""
if '#' in rel_path:
rel_path, fragment = rel_path.split('#', 1)
fragment = f'#{fragment}'
# 获取绝对路径
abs_path = self.get_absolute_doc_path(rel_path)
if abs_path:
new_text = f'[{link_text}]({abs_path}{fragment})'
changes.append((full_match, new_text, '文档链接'))
# 如果没有需要修改的内容
if not changes:
print(f"{Colors.GREEN}文件不需要修改{Colors.ENDC}")
return True
# 显示找到的修改项
print(f"\n{Colors.BLUE}找到 {len(changes)} 个需要修改的内容:{Colors.ENDC}")
for i, (old, new, change_type) in enumerate(changes):
print(f"{Colors.CYAN}修改 {i+1} ({change_type}):{Colors.ENDC}")
print(f" - 原始内容: {Colors.YELLOW}{old[:100]}{'...' if len(old) > 100 else ''}{Colors.ENDC}")
print(f" - 新内容: {Colors.GREEN}{new[:100]}{'...' if len(new) > 100 else ''}{Colors.ENDC}")
print()
# 询问是否执行修改
selected_changes = []
response = input(f"{Colors.BOLD}是否应用这些修改? (y/n/部分修改输入数字如1,3,5): {Colors.ENDC}")
if response.lower() == 'n':
print(f"{Colors.BLUE}已取消修改{Colors.ENDC}")
return False
elif response.lower() == 'y':
selected_changes = changes
else:
try:
# 解析用户选择的修改索引
indices = [int(i.strip()) - 1 for i in response.split(',')]
selected_changes = [changes[i] for i in indices if 0 <= i < len(changes)]
if not selected_changes:
print(f"{Colors.YELLOW}未选择任何有效修改,操作取消{Colors.ENDC}")
return False
except:
print(f"{Colors.YELLOW}输入格式有误,操作取消{Colors.ENDC}")
return False
# 应用修改
modified_content = content
for old, new, _ in selected_changes:
modified_content = modified_content.replace(old, new)
# 写入文件
with open(self.target_file, 'w', encoding='utf-8') as f:
f.write(modified_content)
print(f"{Colors.GREEN}成功应用 {len(selected_changes)} 个修改到文件{Colors.ENDC}")
return True
except Exception as e:
print(f"{Colors.RED}处理文件时出错: {e}{Colors.ENDC}")
import traceback
traceback.print_exc()
return False
def main():
"""主函数"""
# 检查命令行参数
if len(sys.argv) != 2:
print(f"用法: {sys.argv[0]} <目标文件路径>")
print(f"例如: {sys.argv[0]} /Users/allen/Documents/dify-docs-mintlify/zh-hans/guides/workflow/nodes/parameter-extractor.mdx")
return
# 获取目标文件路径
target_file = sys.argv[1]
# 检查文件是否存在
if not os.path.isfile(target_file):
print(f"{Colors.RED}文件不存在: {target_file}{Colors.ENDC}")
return
# 初始化并处理文件
helper = DocMigrationHelper(target_file)
print(f"{Colors.HEADER}开始处理文件: {target_file}{Colors.ENDC}")
print(f"对应的源文件: {helper.source_file or '未找到'}")
helper.process_file()
if __name__ == "__main__":
main()