mirror of
https://github.com/langgenius/dify-docs.git
synced 2026-03-27 13:28:32 +07:00
Docs: update docs image link
This commit is contained in:
@@ -3,15 +3,37 @@
|
||||
Mintlify图片格式转换工具
|
||||
|
||||
这个脚本用于扫描dify-docs-mintlify目录中的所有.mdx文件,
|
||||
并将<Frame>标签中的图片转换为标准Markdown格式。
|
||||
并将<Frame>标签中的图片转换为标准Markdown格式或HTML格式。
|
||||
|
||||
转换前:
|
||||
<Frame caption="示例标题">
|
||||
<img src="https://assets-docs.dify.ai/example.png" alt="示例" />
|
||||
</Frame>
|
||||
支持以下转换:
|
||||
1. 基本Frame转Markdown:
|
||||
<Frame caption="标题">
|
||||
<img src="https://example.com/image.png" alt="描述" />
|
||||
</Frame>
|
||||
|
||||
转换为:
|
||||

|
||||
|
||||
转换后:
|
||||

|
||||
2. 带自闭合标签的Frame:
|
||||
<Frame>
|
||||
<img src="https://example.com/image.png" alt="" / >
|
||||
</Frame>
|
||||
|
||||
转换为:
|
||||

|
||||
|
||||
3. 带宽度的Frame转HTML:
|
||||
<Frame caption="标题" width="369">
|
||||
<img src="https://example.com/image.png" alt="描述" />
|
||||
</Frame>
|
||||
|
||||
转换为:
|
||||
<img
|
||||
src="https://example.com/image.png"
|
||||
width="369"
|
||||
className="mx-auto"
|
||||
alt="描述"
|
||||
/>
|
||||
"""
|
||||
|
||||
import os
|
||||
@@ -33,12 +55,32 @@ class Colors:
|
||||
BOLD = '\033[1m'
|
||||
UNDERLINE = '\033[4m'
|
||||
|
||||
# 匹配Frame标签中的图片
|
||||
FRAME_IMG_PATTERN = re.compile(r'<Frame(?:\s+caption="([^"]*)")?\s*>\s*<img\s+src="([^"]+)"(?:\s+alt="([^"]*)")?\s*/?>\s*</Frame>', re.DOTALL)
|
||||
# Matches an image wrapped in a <Frame> tag, covering several spellings:
# 1. Standard form:            <Frame>...<img src="..." alt="..." />...</Frame>
# 2. Oddly closed <img> tags:  <Frame>...<img src="..." alt="" / >...</Frame>
# 3. Arbitrary whitespace between the tags, and any non-quote characters
#    inside attribute values.
#
# Capture groups: 1 = caption (optional), 2 = width (optional),
#                 3 = src, 4 = alt (optional).
# Accepted <img> endings: "/>" (whitespace allowed before ">"), "/ >" and
# "></img>".
# Attributes are only recognised in the order written here (caption then
# width on <Frame>; src then alt on <img>).
# NOTE(review): re.DOTALL is a no-op for this pattern because it contains
# no "." metacharacter.
FRAME_IMG_PATTERN = re.compile(
    r'<Frame(?:\s+caption="([^"]*)")?(?:\s+width="([^"]*)")?\s*>\s*'
    r'<img\s+src="([^"]+)"(?:\s+alt="([^"]*)")?\s*(?:\/\s*>|\/ >|>\s*<\/img>)\s*'
    r'<\/Frame>',
    re.DOTALL
)
|
||||
|
||||
def convert_frame_to_markdown(content: str) -> Tuple[str, List[Tuple[str, str]]]:
|
||||
# Alternative layout in which every attribute sits on its own line, e.g.:
# <img
#   src="https://example.com/image.png"
#   width="369"
#   className="mx-auto"
#   alt="metadata_field"
# />
#
# Capture groups: 1 = src, 2 = width (optional), 3 = alt (optional).
# The className value is matched but not captured. Each attribute must be
# followed by a newline and the attributes must appear in exactly this order
# (src, width, className, alt) for the pattern to match.
# NOTE(review): re.DOTALL is a no-op for this pattern because it contains
# no "." metacharacter.
HTML_IMG_PATTERN = re.compile(
    r'<img\s*\n\s*src="([^"]+)"\s*\n\s*(?:width="([^"]+)"\s*\n\s*)?(?:className="[^"]*"\s*\n\s*)?(?:alt="([^"]*)"\s*\n\s*)?\/>',
    re.DOTALL
)
|
||||
|
||||
def convert_frame_to_markdown(content: str) -> Tuple[str, List[Tuple[str, str, str]]]:
|
||||
"""
|
||||
将Frame标签中的图片转换为Markdown格式
|
||||
将Frame标签中的图片转换为Markdown或HTML格式
|
||||
|
||||
Args:
|
||||
content: 文件内容
|
||||
@@ -50,32 +92,67 @@ def convert_frame_to_markdown(content: str) -> Tuple[str, List[Tuple[str, str]]]
|
||||
|
||||
def replace_frame(match):
|
||||
caption = match.group(1) or ""
|
||||
src = match.group(2)
|
||||
alt = match.group(3) or caption or ""
|
||||
width = match.group(2) # 可能为None
|
||||
src = match.group(3)
|
||||
alt = match.group(4) or caption or ""
|
||||
|
||||
# 原始内容
|
||||
original = match.group(0)
|
||||
|
||||
# 转换为Markdown格式
|
||||
markdown = f""
|
||||
# 转换格式
|
||||
if width:
|
||||
# 带宽度的转为HTML格式
|
||||
new_format = "HTML"
|
||||
markdown = f"""<img
|
||||
src="{src}"
|
||||
width="{width}"
|
||||
className="mx-auto"
|
||||
alt="{alt}"
|
||||
/>"""
|
||||
else:
|
||||
# 不带宽度的转为Markdown格式
|
||||
new_format = "Markdown"
|
||||
markdown = f""
|
||||
|
||||
# 记录替换
|
||||
replacements.append((original, markdown))
|
||||
replacements.append((original, markdown, new_format))
|
||||
|
||||
return markdown
|
||||
|
||||
# 执行替换
|
||||
# 先处理Frame标签
|
||||
new_content = FRAME_IMG_PATTERN.sub(replace_frame, content)
|
||||
|
||||
return new_content, replacements
|
||||
# 再处理HTML格式的img标签
|
||||
def replace_html_img(match):
|
||||
src = match.group(1)
|
||||
width = match.group(2) # 可能为None
|
||||
alt = match.group(3) or ""
|
||||
|
||||
# 原始内容
|
||||
original = match.group(0)
|
||||
|
||||
# HTML格式的图片保持为HTML格式,但直接转为Markdown
|
||||
new_format = "Markdown"
|
||||
markdown = f""
|
||||
|
||||
# 记录替换
|
||||
replacements.append((original, markdown, new_format))
|
||||
|
||||
return markdown
|
||||
|
||||
# 处理HTML格式的img标签
|
||||
final_content = HTML_IMG_PATTERN.sub(replace_html_img, new_content)
|
||||
|
||||
return final_content, replacements
|
||||
|
||||
def process_file(file_path: str, dry_run: bool = False) -> Tuple[int, List[Tuple[str, str]]]:
|
||||
def process_file(file_path: str, dry_run: bool = False, debug: bool = False) -> Tuple[int, List[Tuple[str, str, str]]]:
|
||||
"""
|
||||
处理单个文件
|
||||
|
||||
Args:
|
||||
file_path: 文件路径
|
||||
dry_run: 是否只预览修改而不实际写入
|
||||
debug: 是否显示调试信息
|
||||
|
||||
Returns:
|
||||
Tuple[替换的数量, 替换记录列表]
|
||||
@@ -84,6 +161,45 @@ def process_file(file_path: str, dry_run: bool = False) -> Tuple[int, List[Tuple
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
content = f.read()
|
||||
|
||||
if debug:
|
||||
# 直接检查正则表达式匹配
|
||||
frame_matches = FRAME_IMG_PATTERN.findall(content)
|
||||
html_matches = HTML_IMG_PATTERN.findall(content)
|
||||
|
||||
print(f"\n{Colors.CYAN}匹配结果调试信息:{Colors.ENDC}")
|
||||
print(f"在文件 {file_path} 中找到:")
|
||||
print(f"- {len(frame_matches)} 个Frame标签中的图片")
|
||||
print(f"- {len(html_matches)} 个HTML格式的图片")
|
||||
|
||||
# 打印Frame标签匹配
|
||||
for i, match in enumerate(frame_matches):
|
||||
caption, width, src, alt = match
|
||||
print(f"\n Frame图片 {i+1}:")
|
||||
print(f" caption: '{caption}'")
|
||||
print(f" width: '{width}'")
|
||||
print(f" src: '{src}'")
|
||||
print(f" alt: '{alt}'")
|
||||
|
||||
# 显示原始文本片段
|
||||
pattern_matches = list(FRAME_IMG_PATTERN.finditer(content))
|
||||
if i < len(pattern_matches):
|
||||
orig_text = pattern_matches[i].group(0)
|
||||
print(f" 原始文本: '{orig_text[:100]}...'")
|
||||
|
||||
# 打印HTML图片匹配
|
||||
for i, match in enumerate(html_matches):
|
||||
src, width, alt = match
|
||||
print(f"\n HTML图片 {i+1}:")
|
||||
print(f" src: '{src}'")
|
||||
print(f" width: '{width}'")
|
||||
print(f" alt: '{alt}'")
|
||||
|
||||
# 显示原始文本片段
|
||||
pattern_matches = list(HTML_IMG_PATTERN.finditer(content))
|
||||
if i < len(pattern_matches):
|
||||
orig_text = pattern_matches[i].group(0)
|
||||
print(f" 原始文本: '{orig_text[:100]}...'")
|
||||
|
||||
# 转换内容
|
||||
new_content, replacements = convert_frame_to_markdown(content)
|
||||
|
||||
@@ -97,29 +213,48 @@ def process_file(file_path: str, dry_run: bool = False) -> Tuple[int, List[Tuple
|
||||
print(f"{Colors.FAIL}处理文件时出错 {file_path}: {e}{Colors.ENDC}")
|
||||
return 0, []
|
||||
|
||||
def scan_directory(dir_path: str, dry_run: bool = False, extensions: List[str] = ['.mdx']) -> Tuple[int, int, int]:
|
||||
def scan_directory(dir_path: str, dry_run: bool = False, auto_confirm: bool = False, debug: bool = False, extensions: List[str] = ['.mdx']) -> Tuple[int, int, int, int]:
|
||||
"""
|
||||
扫描目录并处理文件
|
||||
|
||||
Args:
|
||||
dir_path: 目录路径
|
||||
dry_run: 是否只预览修改而不实际写入
|
||||
auto_confirm: 是否自动确认所有修改
|
||||
debug: 是否显示调试信息
|
||||
extensions: 要处理的文件扩展名列表
|
||||
|
||||
Returns:
|
||||
Tuple[处理的文件数, 包含Frame的文件数, 替换的总数]
|
||||
Tuple[处理的文件数, 修改的文件数, 转为Markdown的数量, 转为HTML的数量]
|
||||
"""
|
||||
file_count = 0
|
||||
modified_file_count = 0
|
||||
total_replacements = 0
|
||||
markdown_count = 0
|
||||
html_count = 0
|
||||
|
||||
for root, _, files in os.walk(dir_path):
|
||||
for root, dirs, files in os.walk(dir_path):
|
||||
# 跳过.git等特殊目录
|
||||
dirs[:] = [d for d in dirs if not d.startswith('.')]
|
||||
|
||||
for file in files:
|
||||
if any(file.endswith(ext) for ext in extensions):
|
||||
file_path = os.path.join(root, file)
|
||||
rel_path = os.path.relpath(file_path, dir_path)
|
||||
|
||||
# 如果不是自动确认模式,则询问是否处理此文件
|
||||
if not auto_confirm:
|
||||
print(f"\n{Colors.CYAN}文件 ({file_count+1}): {rel_path}{Colors.ENDC}")
|
||||
response = input(f"{Colors.BOLD}是否处理此文件? (y/n/q-退出): {Colors.ENDC}")
|
||||
if response.lower() == 'n':
|
||||
print(f"{Colors.BLUE}跳过此文件{Colors.ENDC}")
|
||||
file_count += 1
|
||||
continue
|
||||
elif response.lower() == 'q':
|
||||
print(f"{Colors.BLUE}退出处理{Colors.ENDC}")
|
||||
break
|
||||
|
||||
# 处理文件
|
||||
count, replacements = process_file(file_path, dry_run)
|
||||
count, replacements = process_file(file_path, dry_run, debug)
|
||||
|
||||
file_count += 1
|
||||
if count > 0:
|
||||
@@ -127,17 +262,22 @@ def scan_directory(dir_path: str, dry_run: bool = False, extensions: List[str] =
|
||||
print(f"\n{Colors.CYAN}文件: {rel_path}{Colors.ENDC}")
|
||||
print(f"{Colors.GREEN}找到 {count} 个需要转换的图片{Colors.ENDC}")
|
||||
|
||||
# 统计转换类型
|
||||
md_count = sum(1 for _, _, fmt in replacements if fmt == "Markdown")
|
||||
html_count = sum(1 for _, _, fmt in replacements if fmt == "HTML")
|
||||
|
||||
# 显示替换详情
|
||||
for i, (original, markdown) in enumerate(replacements):
|
||||
for i, (original, converted, format_type) in enumerate(replacements):
|
||||
# 为了简洁,截断过长的内容
|
||||
orig_short = original[:100] + "..." if len(original) > 100 else original
|
||||
print(f" {i+1}. {Colors.WARNING}{orig_short}{Colors.ENDC}")
|
||||
print(f" -> {Colors.GREEN}{markdown}{Colors.ENDC}")
|
||||
print(f" {i+1}. {Colors.WARNING}[{format_type}] {orig_short}{Colors.ENDC}")
|
||||
print(f" -> {Colors.GREEN}{converted}{Colors.ENDC}")
|
||||
|
||||
modified_file_count += 1
|
||||
total_replacements += count
|
||||
markdown_count += md_count
|
||||
html_count += html_count
|
||||
|
||||
return file_count, modified_file_count, total_replacements
|
||||
return file_count, modified_file_count, markdown_count, html_count
|
||||
|
||||
def main():
|
||||
"""主程序入口"""
|
||||
@@ -150,52 +290,97 @@ def main():
|
||||
print(f"{Colors.HEADER} Mintlify图片格式转换工具 {Colors.ENDC}")
|
||||
print(f"{Colors.HEADER}{'='*60}{Colors.ENDC}")
|
||||
|
||||
# 参数处理
|
||||
if len(sys.argv) > 1:
|
||||
if sys.argv[1] == '--help' or sys.argv[1] == '-h':
|
||||
print("\n使用方法:")
|
||||
print(" python convert_image_format.py [目录路径] [选项]")
|
||||
print("\n选项:")
|
||||
print(" --dry-run 仅预览修改,不实际写入")
|
||||
print(" --help, -h 显示此帮助信息")
|
||||
return
|
||||
# 交互式菜单
|
||||
while True:
|
||||
print(f"\n{Colors.BOLD}请选择操作模式:{Colors.ENDC}")
|
||||
print("1. 处理单个文件")
|
||||
print("2. 处理指定目录中的所有文件")
|
||||
print("3. 退出")
|
||||
|
||||
choice = input(f"{Colors.BOLD}请输入选项 (1-3): {Colors.ENDC}")
|
||||
|
||||
if choice == '1':
|
||||
# 处理单个文件
|
||||
file_path = input(f"{Colors.BOLD}请输入文件路径 (绝对或相对路径): {Colors.ENDC}")
|
||||
|
||||
# 如果是相对路径,则基于默认目录
|
||||
if not os.path.isabs(file_path):
|
||||
file_path = os.path.join(default_dir, file_path)
|
||||
|
||||
if not os.path.isfile(file_path):
|
||||
print(f"{Colors.FAIL}错误: 文件不存在: {file_path}{Colors.ENDC}")
|
||||
continue
|
||||
|
||||
# 询问是否只预览修改
|
||||
preview = input(f"{Colors.BOLD}是否只预览修改而不实际写入? (y/n): {Colors.ENDC}").lower() == 'y'
|
||||
|
||||
# 询问是否显示调试信息
|
||||
debug = input(f"{Colors.BOLD}是否显示调试信息? (y/n): {Colors.ENDC}").lower() == 'y'
|
||||
|
||||
# 处理文件
|
||||
start_time = time.time()
|
||||
count, replacements = process_file(file_path, preview, debug)
|
||||
end_time = time.time()
|
||||
|
||||
if count > 0:
|
||||
# 统计转换类型
|
||||
md_count = sum(1 for _, _, fmt in replacements if fmt == "Markdown")
|
||||
html_count = sum(1 for _, _, fmt in replacements if fmt == "HTML")
|
||||
|
||||
print(f"\n{Colors.GREEN}处理完成! 耗时: {end_time - start_time:.2f}秒{Colors.ENDC}")
|
||||
print(f"{Colors.GREEN}发现 {count} 个需要转换的图片:{Colors.ENDC}")
|
||||
print(f"{Colors.GREEN}- 转换为Markdown格式: {md_count}{Colors.ENDC}")
|
||||
print(f"{Colors.GREEN}- 转换为HTML格式: {html_count}{Colors.ENDC}")
|
||||
|
||||
if preview:
|
||||
print(f"\n{Colors.BLUE}这是预览模式,没有实际写入修改。{Colors.ENDC}")
|
||||
else:
|
||||
print(f"{Colors.BLUE}没有找到需要转换的图片{Colors.ENDC}")
|
||||
|
||||
elif choice == '2':
|
||||
# 处理目录
|
||||
dir_path = input(f"{Colors.BOLD}请输入目录路径 (绝对或相对路径): {Colors.ENDC}")
|
||||
|
||||
# 如果是相对路径,则基于默认目录
|
||||
if not os.path.isabs(dir_path):
|
||||
dir_path = os.path.join(default_dir, dir_path)
|
||||
|
||||
if not os.path.isdir(dir_path):
|
||||
print(f"{Colors.FAIL}错误: 目录不存在: {dir_path}{Colors.ENDC}")
|
||||
continue
|
||||
|
||||
# 询问是否只预览修改
|
||||
preview = input(f"{Colors.BOLD}是否只预览修改而不实际写入? (y/n): {Colors.ENDC}").lower() == 'y'
|
||||
|
||||
# 询问是否自动确认所有修改
|
||||
auto_confirm = input(f"{Colors.BOLD}是否自动确认所有修改? (y/n): {Colors.ENDC}").lower() == 'y'
|
||||
|
||||
# 询问是否显示调试信息
|
||||
debug = input(f"{Colors.BOLD}是否显示调试信息? (y/n): {Colors.ENDC}").lower() == 'y'
|
||||
|
||||
# 开始处理
|
||||
start_time = time.time()
|
||||
file_count, modified_file_count, markdown_count, html_count = scan_directory(dir_path, preview, auto_confirm=auto_confirm, debug=debug)
|
||||
end_time = time.time()
|
||||
|
||||
# 显示总结
|
||||
print(f"\n{Colors.HEADER}{'='*60}{Colors.ENDC}")
|
||||
print(f"{Colors.GREEN}处理完成! 耗时: {end_time - start_time:.2f}秒{Colors.ENDC}")
|
||||
print(f"{Colors.GREEN}扫描了 {file_count} 个文件{Colors.ENDC}")
|
||||
print(f"{Colors.GREEN}修改了 {modified_file_count} 个文件中的图片格式{Colors.ENDC}")
|
||||
print(f"{Colors.GREEN}- 转换为Markdown格式: {markdown_count}{Colors.ENDC}")
|
||||
print(f"{Colors.GREEN}- 转换为HTML格式: {html_count}{Colors.ENDC}")
|
||||
|
||||
if preview:
|
||||
print(f"\n{Colors.BLUE}这是预览模式,没有实际写入修改。{Colors.ENDC}")
|
||||
|
||||
elif choice == '3':
|
||||
# 退出
|
||||
print(f"{Colors.BLUE}感谢使用,再见!{Colors.ENDC}")
|
||||
break
|
||||
|
||||
if os.path.exists(sys.argv[1]) and os.path.isdir(sys.argv[1]):
|
||||
target_dir = sys.argv[1]
|
||||
else:
|
||||
print(f"{Colors.FAIL}错误: 无效的目录路径: {sys.argv[1]}{Colors.ENDC}")
|
||||
return
|
||||
else:
|
||||
# 使用默认目录
|
||||
target_dir = default_dir
|
||||
|
||||
# 检查是否为预览模式
|
||||
dry_run = '--dry-run' in sys.argv
|
||||
|
||||
print(f"目标目录: {target_dir}")
|
||||
print(f"预览模式: {'是' if dry_run else '否'}\n")
|
||||
|
||||
# 确认操作
|
||||
if not dry_run:
|
||||
response = input(f"{Colors.BOLD}这将修改所有.mdx文件中的图片格式。确认继续? (y/n): {Colors.ENDC}")
|
||||
if response.lower() != 'y':
|
||||
print(f"{Colors.BLUE}操作已取消{Colors.ENDC}")
|
||||
return
|
||||
|
||||
# 开始处理
|
||||
start_time = time.time()
|
||||
file_count, modified_file_count, total_replacements = scan_directory(target_dir, dry_run)
|
||||
end_time = time.time()
|
||||
|
||||
# 显示总结
|
||||
print(f"\n{Colors.HEADER}{'='*60}{Colors.ENDC}")
|
||||
print(f"{Colors.GREEN}处理完成! 耗时: {end_time - start_time:.2f}秒{Colors.ENDC}")
|
||||
print(f"{Colors.GREEN}扫描了 {file_count} 个文件{Colors.ENDC}")
|
||||
print(f"{Colors.GREEN}修改了 {modified_file_count} 个文件中的 {total_replacements} 处图片格式{Colors.ENDC}")
|
||||
|
||||
if dry_run:
|
||||
print(f"\n{Colors.BLUE}这是预览模式,没有实际写入修改。{Colors.ENDC}")
|
||||
print(f"{Colors.BLUE}如需实际应用修改,请去掉 --dry-run 选项重新运行脚本。{Colors.ENDC}")
|
||||
print(f"{Colors.WARNING}无效选项,请重试{Colors.ENDC}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
268
scripts/doc_link_checker.py
Normal file
268
scripts/doc_link_checker.py
Normal file
@@ -0,0 +1,268 @@
|
||||
#!/usr/bin/env python3
|
||||
import os
|
||||
import re
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple, Dict, Set
|
||||
import time
|
||||
import sys
|
||||
|
||||
# 颜色代码,用于终端输出
|
||||
class Colors:
    """ANSI escape sequences used to decorate terminal output.

    Wrap text as ``f"{Colors.X}text{Colors.ENDC}"``; ``ENDC`` resets the
    terminal attributes again.
    """

    HEADER = "\x1b[95m"
    BLUE = "\x1b[94m"
    GREEN = "\x1b[92m"
    WARNING = "\x1b[93m"
    FAIL = "\x1b[91m"
    ENDC = "\x1b[0m"       # reset all attributes
    BOLD = "\x1b[1m"
    UNDERLINE = "\x1b[4m"
|
||||
|
||||
def log_info(message):
    """Print *message* to stdout behind an ``[INFO]`` tag wrapped in ``Colors.BLUE``."""
    tag = f"{Colors.BLUE}[INFO]{Colors.ENDC}"
    print(f"{tag} {message}")
|
||||
|
||||
def log_warning(message):
    """Print *message* to stdout behind a ``[WARNING]`` tag wrapped in ``Colors.WARNING``."""
    tag = f"{Colors.WARNING}[WARNING]{Colors.ENDC}"
    print(f"{tag} {message}")
|
||||
|
||||
def log_error(message):
    """Print *message* to stdout behind an ``[ERROR]`` tag wrapped in ``Colors.FAIL``."""
    tag = f"{Colors.FAIL}[ERROR]{Colors.ENDC}"
    print(f"{tag} {message}")
|
||||
|
||||
def log_success(message):
    """Print *message* to stdout behind a ``[SUCCESS]`` tag wrapped in ``Colors.GREEN``."""
    tag = f"{Colors.GREEN}[SUCCESS]{Colors.ENDC}"
    print(f"{tag} {message}")
|
||||
|
||||
def find_all_md_files(base_dir: str) -> List[Path]:
    """Recursively collect every ``.md`` and ``.mdx`` path below *base_dir*.

    Args:
        base_dir: Directory to search.

    Returns:
        All ``*.md`` matches followed by all ``*.mdx`` matches, each group in
        the order produced by ``Path.glob``.
    """
    root = Path(base_dir)
    return [
        match
        for pattern in ("*.md", "*.mdx")
        for match in root.glob(f"**/{pattern}")
    ]
|
||||
|
||||
def extract_links(file_content: str) -> List[Tuple[str, str, str]]:
    """Extract every link found in *file_content*.

    Three syntaxes are recognised: Markdown ``[text](url)``, HTML
    ``<a href="url">text</a>`` and Mintlify ``<Card title="..." href="url">``.

    Returns:
        ``(representation, link text, url)`` tuples: Markdown links first,
        then ``<a>`` tags, then ``<Card>`` components. The representation is
        rebuilt from the captured pieces, so for ``<a>`` and ``<Card>`` it
        omits any additional attributes the source tag carried.
    """
    # Markdown links: [text](url)
    markdown = [
        (f"[{label}]({target})", label, target)
        for label, target in re.findall(r'\[(.*?)\]\((.*?)\)', file_content)
    ]

    # <a> tags; the regex yields (url, text)
    anchors = [
        (f'<a href="{target}">{label}</a>', label, target)
        for target, label in re.findall(
            r'<a\s+(?:[^>]*?\s+)?href="([^"]*)"[^>]*>(.*?)<\/a>', file_content
        )
    ]

    # Mintlify Card components; the body may span lines, hence DOTALL
    cards = [
        (f'<Card title="{title}" href="{target}">{body}</Card>', title, target)
        for title, target, body in re.findall(
            r'<Card\s+title="([^"]*)"[^>]*\s+href="([^"]*)"[^>]*>(.*?)<\/Card>',
            file_content,
            re.DOTALL,
        )
    ]

    return markdown + anchors + cards
|
||||
|
||||
def check_link_extensions(links: List[Tuple[str, str, str]],
                          file_path: Path,
                          all_files: Dict[str, Path],
                          base_dir: Path) -> List[Tuple[str, str, str, str]]:
    """Find relative links that still carry a ``.md`` / ``.mdx`` extension.

    External links (``http://``, ``https://``, ``mailto:``, ``tel:``),
    pure anchors (``#...``) and absolute paths (``/...``) are ignored.
    A trailing ``#fragment`` or ``?query`` is set aside before the extension
    check and re-attached to the fixed URL, so ``page.md#intro`` is reported
    with the fix ``page#intro``.

    Args:
        links: ``(representation, link text, url)`` tuples.
        file_path: Unused; kept for interface compatibility.
        all_files: Unused; kept for interface compatibility.
        base_dir: Unused; kept for interface compatibility.

    Returns:
        ``(representation, link text, original url, fixed url)`` tuples, in
        input order.
    """
    ignored_prefixes = ('http://', 'https://', '#', 'mailto:', 'tel:', '/')
    issues = []

    for full_match, text, url in links:
        if url.startswith(ignored_prefixes):
            continue

        # Separate the path from an optional "#fragment" / "?query" tail so
        # the extension test looks at the file name only.
        tail_start = re.search(r'[?#]', url)
        cut = tail_start.start() if tail_start else len(url)
        path, tail = url[:cut], url[cut:]

        if path.endswith(('.md', '.mdx')):
            fixed_url = path.rsplit('.', 1)[0] + tail
            issues.append((full_match, text, url, fixed_url))

    return issues
|
||||
|
||||
def fix_links(file_path: Path, issues: List[Tuple[str, str, str, str]], dry_run: bool = True) -> bool:
    """Rewrite the links listed in *issues* inside *file_path*.

    Args:
        file_path: File to patch.
        issues: ``(representation, link text, original url, fixed url)``
            tuples. The representation decides the syntax that is patched:
            one starting with ``<Card`` or ``<a`` is fixed through its
            ``href="..."`` attribute, anything else is treated as a Markdown
            ``](url)`` link.
        dry_run: When True the file is read but never written.

    Returns:
        True only if the file was actually rewritten.
    """
    if not issues:
        return False

    # newline='' keeps the file's original line endings intact on round-trip.
    with open(file_path, 'r', encoding='utf-8', newline='') as f:
        content = f.read()

    modified_content = content

    for full_match, _text, old_url, new_url in issues:
        # Decide by how the representation *starts*: a substring test would
        # misclassify Markdown links whose text merely mentions "Card" or "<a".
        if full_match.startswith(('<Card', '<a')):
            old_pattern = f'href="{old_url}"'
            new_pattern = f'href="{new_url}"'
        else:
            old_pattern = f']({old_url})'
            new_pattern = f']({new_url})'
        modified_content = modified_content.replace(old_pattern, new_pattern)

    if dry_run or modified_content == content:
        return False

    with open(file_path, 'w', encoding='utf-8', newline='') as f:
        f.write(modified_content)
    return True
|
||||
|
||||
def process_file(file_path: Path, all_files: Dict[str, Path], base_dir: Path, args):
    """Check one file for link problems and, if confirmed, fix them.

    Args:
        file_path: File to inspect.
        all_files: Mapping of relative path to file path, forwarded to
            ``check_link_extensions``.
        base_dir: Documentation root; used to print a relative path.
        args: Parsed CLI options. ``args.auto_fix`` and ``args.dry_run`` are
            read; ``args.auto_fix`` is set to True when the user answers
            ``a`` so that later files are no longer prompted for.

    Returns:
        True if the file was modified or, in dry-run mode, if it contains
        issues that would have been fixed (so the caller's dry-run summary
        counts it). False when nothing was found, the user skipped the
        file, or an error occurred.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        links = extract_links(content)
        issues = check_link_extensions(links, file_path, all_files, base_dir)
        if not issues:
            return False

        rel_path = file_path.relative_to(base_dir)
        print(f"\n{Colors.HEADER}{Colors.BOLD}检查文件: {rel_path}{Colors.ENDC}")

        for i, (_full_match, _text, old_url, new_url) in enumerate(issues, 1):
            print(f"  {i}. 发现问题: {Colors.WARNING}{old_url}{Colors.ENDC} -> {Colors.GREEN}{new_url}{Colors.ENDC}")

        # Ask for confirmation unless every fix is already pre-approved.
        if not args.auto_fix:
            choice = input(f"\n{Colors.BOLD}修复这些问题? (y/n/a/q): {Colors.ENDC}").strip().lower()
            if choice == 'q':  # quit the whole script
                log_info("用户请求退出脚本")
                sys.exit(0)
            if choice == 'a':  # fix this file and all following ones
                args.auto_fix = True
            elif choice != 'y':
                log_info(f"跳过修复 {rel_path}")
                return False

        fixed = fix_links(file_path, issues, dry_run=args.dry_run)

        if args.dry_run:
            log_info(f"已检测到 {len(issues)} 个需要修复的链接 (模拟运行,实际未修改)")
            # fix_links never reports a change in dry-run mode; report the
            # file as affected so the dry-run summary is not always zero.
            return True

        if fixed:
            log_success(f"已修复 {len(issues)} 个链接问题")
            # In interactive mode, pause so the user can read the result.
            if not args.auto_fix:
                input(f"\n{Colors.BOLD}已完成修复,按回车继续下一个文件...{Colors.ENDC}")

        return fixed

    except Exception as e:
        # Best effort: report the failure and continue with the next file.
        log_error(f"处理文件 {file_path} 时出错: {str(e)}")
        return False
|
||||
|
||||
def main():
    """Command-line entry point: scan a docs tree and fix link extensions.

    Parses the CLI options, asks for the documentation root when it was not
    given, asks for confirmation, then runs ``process_file`` on every
    Markdown/MDX file found.

    Returns:
        Process exit status: 1 if the directory is invalid, otherwise 0.
    """
    parser = argparse.ArgumentParser(description='检查并修复文档中的链接问题')
    parser.add_argument('doc_path', nargs='?', help='文档根目录路径')
    parser.add_argument('--dry-run', action='store_true', help='只显示将要修改的内容,不实际修改文件')
    parser.add_argument('--auto-fix', action='store_true', help='自动修复所有问题,不询问')
    args = parser.parse_args()

    # Ask interactively when no path was supplied on the command line.
    if args.doc_path is None:
        doc_path = input(f"{Colors.BOLD}请输入文档根目录路径: {Colors.ENDC}")
        args.doc_path = doc_path.strip()

    base_dir = Path(args.doc_path)

    if not base_dir.is_dir():
        log_error(f"指定的目录 '{args.doc_path}' 不存在或不是一个目录")
        return 1

    # Show what is about to happen and ask for confirmation.
    print(f"\n{Colors.BOLD}将要扫描的目录:{Colors.ENDC} {Colors.GREEN}{base_dir}{Colors.ENDC}")
    if args.dry_run:
        print(f"{Colors.BOLD}模式:{Colors.ENDC} {Colors.BLUE}仅检查,不修改文件{Colors.ENDC}")
    elif args.auto_fix:
        print(f"{Colors.BOLD}模式:{Colors.ENDC} {Colors.BLUE}自动修复所有问题{Colors.ENDC}")
    else:
        print(f"{Colors.BOLD}模式:{Colors.ENDC} {Colors.BLUE}交互式修复{Colors.ENDC}")

    confirm = input(f"\n{Colors.BOLD}确认开始扫描? (y/n): {Colors.ENDC}")
    if confirm.lower() != 'y':
        log_info("操作已取消")
        return 0

    log_info(f"开始扫描目录: {base_dir}")

    all_files_list = find_all_md_files(base_dir)
    log_info(f"共找到 {len(all_files_list)} 个文档文件")

    # Relative-path -> file mapping handed to process_file.
    all_files = {str(path.relative_to(base_dir)): path for path in all_files_list}

    fixed_count = 0
    processed_count = 0
    total_files = len(all_files_list)

    try:
        for i, file_path in enumerate(all_files_list, 1):
            # Overwrite the current terminal line with the progress indicator.
            sys.stdout.write("\r" + " " * 80)
            sys.stdout.write(f"\r{Colors.BOLD}进度: {i}/{total_files} ({i/total_files*100:.1f}%){Colors.ENDC}")
            sys.stdout.flush()

            if process_file(file_path, all_files, base_dir, args):
                fixed_count += 1
            processed_count = i
    except KeyboardInterrupt:
        print("\n")
        log_warning("用户中断了处理过程")
        # Fall through so the statistics gathered so far are still shown.

    print("\n")
    # Report the files actually handled, which is fewer than total_files
    # when the run was interrupted.
    log_info(f"扫描完成,共处理 {processed_count} 个文件")

    if args.dry_run:
        log_info(f"发现 {fixed_count} 个文件中有链接问题需要修复")
    else:
        log_success(f"已修复 {fixed_count} 个文件中的链接问题")

    return 0
|
||||
|
||||
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
|
||||
588
scripts/doc_migration_helper.py
Normal file
588
scripts/doc_migration_helper.py
Normal file
@@ -0,0 +1,588 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
文档迁移助手
|
||||
|
||||
这个脚本用于辅助 gitbook 文档(dify-docs)迁移至 mintlify(dify-docs-mintlify)
|
||||
主要功能包括:
|
||||
1. 图片路径替换:从原始文档查找并替换为在线图片链接
|
||||
2. 文档引用路径替换:将相对路径替换为绝对路径
|
||||
3. 支持交互式确认每个修改
|
||||
|
||||
使用方法:
|
||||
python doc_migration_helper.py <目标文件路径>
|
||||
例如:
|
||||
python doc_migration_helper.py /Users/allen/Documents/dify-docs-mintlify/zh-hans/guides/workflow/nodes/parameter-extractor.mdx
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
# ANSI 颜色代码
|
||||
class Colors:
    """ANSI escape sequences used to decorate terminal output.

    Wrap text as ``f"{Colors.X}text{Colors.ENDC}"``; ``ENDC`` resets the
    terminal attributes again.
    """

    HEADER = "\x1b[95m"
    BLUE = "\x1b[94m"
    CYAN = "\x1b[96m"
    GREEN = "\x1b[92m"
    YELLOW = "\x1b[93m"
    RED = "\x1b[91m"
    ENDC = "\x1b[0m"       # reset all attributes
    BOLD = "\x1b[1m"
    UNDERLINE = "\x1b[4m"
|
||||
|
||||
class DocMigrationHelper:
|
||||
def __init__(self, target_file, source_dir="/Users/allen/Documents/dify-docs",
             mintlify_dir="/Users/allen/Documents/dify-docs-mintlify"):
    """Set up the helper for a single target file.

    Args:
        target_file: Path of the Mintlify file to process.
        source_dir: Root directory of the original (source) docs.
        mintlify_dir: Root directory of the Mintlify docs.
    """
    self.target_file, self.source_dir, self.mintlify_dir = (
        target_file, source_dir, mintlify_dir
    )

    # Cache of image-path -> URL lookups; starts empty.
    self.image_url_cache = {}

    # docs.json content, loaded from mintlify_dir.
    self.docs_config = self._load_docs_config()

    # Target path relative to the Mintlify root; must be set before the
    # source file is inferred because the inference reads it.
    self.rel_path = os.path.relpath(target_file, mintlify_dir)
    self.source_file = self._infer_source_file_path()
|
||||
|
||||
def _load_docs_config(self):
    """Load ``docs.json`` from the Mintlify directory.

    Returns:
        The parsed JSON content, or an empty dict (after printing the
        error) when the file cannot be read or parsed.
    """
    try:
        config_file = os.path.join(self.mintlify_dir, "docs.json")
        with open(config_file, 'r', encoding='utf-8') as handle:
            parsed = json.load(handle)
    except Exception as e:
        # Best effort: a missing or broken config is reported, not fatal.
        print(f"{Colors.RED}无法加载docs.json: {e}{Colors.ENDC}")
        return {}
    return parsed
|
||||
|
||||
def _infer_source_file_path(self):
    """Guess which source-docs file corresponds to the target file.

    The lookup proceeds in stages and stops at the first hit:

    1. A list of candidate paths derived from ``self.rel_path`` (language
       directory mapped ``zh-hans`` -> ``zh_CN`` / ``en`` -> ``en_US``,
       ``.mdx`` -> ``.md``, optional ``nodes`` -> ``node`` rename, optional
       ``guides`` prefix).
    2. For workflow node files, a file-name match (allowing ``-``/``_``
       swaps) inside ``guides/workflow/node``.
    3. A file-name search over the whole language directory; when several
       files match, the one sharing the most leading path components with
       the target wins (first match wins ties).

    Returns:
        The path of the matching source file, or None if nothing matched.
    """
    parts = self.rel_path.split(os.sep)

    if len(parts) > 1:
        # First component is the language directory (zh-hans -> zh_CN, ...).
        lang_dir = {"zh-hans": "zh_CN", "en": "en_US"}.get(parts[0], parts[0])
        rest_path = os.path.join(*parts[1:])
    else:
        # Target sits directly in the Mintlify root: there is no language
        # segment, and os.path.join() with no arguments would raise.
        lang_dir = ""
        rest_path = parts[0]

    # Source docs use .md where Mintlify uses .mdx.
    if rest_path.endswith(".mdx"):
        rest_path = rest_path[:-4] + ".md"

    target_basename = os.path.basename(self.target_file)
    if target_basename.endswith(".mdx"):
        target_basename = target_basename[:-4]
    name_variants = (
        target_basename,
        target_basename.replace("-", "_"),
        target_basename.replace("_", "-"),
    )

    lang_root = os.path.join(self.source_dir, lang_dir)
    # Apply the nodes -> node rename to the relative part only, so a
    # source_dir that happens to contain "nodes" is left untouched.
    node_rest_path = rest_path.replace("nodes", "node")
    has_node_variant = node_rest_path != rest_path
    is_workflow_node = "workflow" in rest_path and "nodes" in rest_path
    node_dir = os.path.join(lang_root, "guides", "workflow", "node")

    # Stage 1: direct candidates, most specific first.
    potential_paths = [os.path.join(lang_root, rest_path)]
    if has_node_variant:
        potential_paths.append(os.path.join(lang_root, node_rest_path))
    potential_paths.append(os.path.join(lang_root, "guides", rest_path))
    if has_node_variant:
        potential_paths.append(os.path.join(lang_root, "guides", node_rest_path))
    if is_workflow_node:
        potential_paths.append(os.path.join(node_dir, target_basename + ".md"))

    for path in potential_paths:
        if os.path.exists(path):
            print(f"{Colors.GREEN}找到源文件: {path}{Colors.ENDC}")
            return path

    # Stage 2: workflow node files may differ only in "-" vs "_".
    if is_workflow_node and os.path.isdir(node_dir):
        for file in os.listdir(node_dir):
            if file.endswith(".md") and os.path.splitext(file)[0] in name_variants:
                found_path = os.path.join(node_dir, file)
                print(f"{Colors.GREEN}找到匹配的源文件: {found_path}{Colors.ENDC}")
                return found_path

    # Stage 3: search the whole language directory by file name.
    print(f"{Colors.YELLOW}尝试搜索整个文档目录...{Colors.ENDC}")
    found_files = [
        os.path.join(root, file)
        for root, _, files in os.walk(lang_root)
        for file in files
        if file.endswith(".md") and os.path.splitext(file)[0] in name_variants
    ]

    if not found_files:
        print(f"{Colors.YELLOW}无法找到对应的源文件{Colors.ENDC}")
        return None

    if len(found_files) == 1:
        print(f"{Colors.GREEN}找到源文件: {found_files[0]}{Colors.ENDC}")
        return found_files[0]

    current_parts = rest_path.split(os.sep)

    def overlap(candidate):
        # Compare relative to the language root so both paths start at the
        # same level; measuring from source_dir would shift every component
        # by one (the language directory) and nothing would line up.
        candidate_parts = os.path.relpath(candidate, lang_root).split(os.sep)
        return sum(
            1
            for a, b in zip(current_parts, candidate_parts)
            if a == b or a.replace("nodes", "node") == b
        )

    # max() keeps the first of several equally good candidates.
    best_match = max(found_files, key=overlap)
    print(f"{Colors.GREEN}找到最匹配的源文件: {best_match}{Colors.ENDC}")
    return best_match
|
||||
|
||||
def get_corresponding_image_url(self, local_path):
    """
    Find the online URL that corresponds to a local image path.

    Lookup order:
      1. ``self.image_url_cache``.
      2. If ``self.source_file`` is unset, guess it from the target file name
         under ``<source_dir>/<lang>/guides/workflow/node``. If that fails,
         scan every ``.md`` file below ``<source_dir>/zh_CN`` for a URL that
         carries the image number, then fall back to a constructed URL.
      3. The source file itself: number match, exact file-name match,
         letters-only file-name match, then images inside ``<Frame>``.
      4. Other ``.md`` files in the source file's directory (number match).
      5. A URL constructed from the source file's directory layout.

    Args:
        local_path: Local image path, e.g.
            ``/zh-cn/user-guide/.gitbook/assets/image (66).png``.

    Returns:
        The online image URL, or ``None`` when none could be determined.
    """
    # Return the cached answer if this path was resolved before.
    if local_path in self.image_url_cache:
        return self.image_url_cache[local_path]

    md_url_pattern = r'!\[.*?\]\((https://assets-docs\.dify\.ai/[^)]+)\)'

    # File name and the "(NN)" sequence number embedded in it, if any.
    local_img_name = os.path.basename(local_path)
    number_match = re.search(r'\((\d+)\)', local_img_name)
    img_number = number_match.group(1) if number_match else None

    def carries_number(url):
        """Return True when url contains the image number as a whole number."""
        if img_number is None:
            return False
        # The look-behind keeps "6" from matching inside "16.png".
        return re.search(rf'(?<!\d){img_number}(?:\.png|\))', url) is not None

    def letters_only(name):
        """Strip everything but ASCII letters, for fuzzy file-name comparison."""
        return re.sub(r'[^a-zA-Z]', '', name)

    def remember(url):
        """Cache url for local_path and hand it back."""
        self.image_url_cache[local_path] = url
        return url

    def read_text(path):
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()

    if not self.source_file:
        print(f"{Colors.YELLOW}无法找到对应的源文件,尝试查找相关文件...{Colors.ENDC}")
        # Guess the source file name from the target file name.
        target_basename = os.path.basename(self.target_file).replace('.mdx', '')

        parts = self.rel_path.split(os.sep)
        lang_dir = {"zh-hans": "zh_CN", "en": "en_US"}.get(parts[0], parts[0])

        # Look in guides/workflow/node first.
        possible_source_dir = os.path.join(self.source_dir, lang_dir, "guides", "workflow", "node")
        if os.path.exists(possible_source_dir):
            wanted_prefix = target_basename.replace("-", "_")
            for file in os.listdir(possible_source_dir):
                if file.endswith(".md") and file.startswith(wanted_prefix):
                    self.source_file = os.path.join(possible_source_dir, file)
                    print(f"{Colors.GREEN}找到可能的源文件: {self.source_file}{Colors.ENDC}")
                    break

        if not self.source_file or not os.path.exists(self.source_file):
            print(f"{Colors.YELLOW}尝试在整个文档中搜索图片...{Colors.ENDC}")
            if img_number:
                # NOTE(review): the search root is hard-coded to zh_CN even when
                # lang_dir is something else — confirm this is intended.
                for root, _, files in os.walk(os.path.join(self.source_dir, "zh_CN")):
                    for file in files:
                        if not file.endswith(".md"):
                            continue
                        md_file = os.path.join(root, file)
                        try:
                            content = read_text(md_file)
                        except (OSError, UnicodeDecodeError):
                            # Best effort: an unreadable file is simply skipped.
                            continue
                        for url in re.findall(md_url_pattern, content):
                            # Only accept a URL that actually carries the image
                            # number; previously the first image URL of the
                            # first file was returned unconditionally.
                            if carries_number(url):
                                print(f"{Colors.GREEN}在文件 {md_file} 中找到可能匹配的图片URL: {url}{Colors.ENDC}")
                                return remember(url)

            # Nothing found: construct a default URL, but only when there is a
            # number to build it from (avoids ".../None.png").
            if parts[0] == "zh-hans" and img_number:
                constructed_url = f"https://assets-docs.dify.ai/dify-enterprise-mintlify/zh_CN/guides/workflow/node/{img_number}.png"
                print(f"{Colors.YELLOW}未找到匹配图片,构造URL: {constructed_url}{Colors.ENDC}")
                return constructed_url
            return None

    try:
        source_content = read_text(self.source_file)
        online_urls = re.findall(md_url_pattern, source_content)

        # First preference: a URL carrying the same sequence number.
        for url in online_urls:
            if carries_number(url):
                return remember(url)

        # Second preference: file-name match, exact and then letters-only.
        for url in online_urls:
            url_basename = os.path.basename(url)
            if url_basename == local_img_name:
                return remember(url)
            clean_local = letters_only(local_img_name)
            clean_url = letters_only(url_basename)
            if clean_local and clean_url and clean_local == clean_url:
                return remember(url)

        # Images embedded in <Frame> components.
        frame_urls = re.findall(
            r'<Frame[^>]*>.*?<img[^>]*src="(https://assets-docs\.dify\.ai/[^"]+)".*?</Frame>',
            source_content,
            re.DOTALL,
        )
        for url in frame_urls:
            url_basename = os.path.basename(url)
            if url_basename == local_img_name or letters_only(url_basename) == letters_only(local_img_name):
                return remember(url)

        # Still nothing: look through sibling .md files for a number match.
        source_dir = os.path.dirname(self.source_file)
        source_name = os.path.basename(self.source_file)
        for file in os.listdir(source_dir):
            if not file.endswith(".md") or file == source_name:
                continue
            related_file = os.path.join(source_dir, file)
            try:
                related_content = read_text(related_file)
            except (OSError, UnicodeDecodeError):
                continue
            for url in re.findall(md_url_pattern, related_content):
                if carries_number(url):
                    print(f"{Colors.GREEN}在相关文件 {related_file} 中找到匹配图片: {url}{Colors.ENDC}")
                    return remember(url)

        # Last resort: construct a URL from the directory layout.
        relative_source_path = os.path.relpath(self.source_file, self.source_dir)
        dir_parts = os.path.dirname(relative_source_path).split(os.sep)
        if img_number and len(dir_parts) >= 2 and dir_parts[0] == "zh_CN":
            constructed_url = f"https://assets-docs.dify.ai/dify-enterprise-mintlify/{dir_parts[0]}/{'/'.join(dir_parts[1:])}/{img_number}.png"
            print(f"{Colors.YELLOW}未找到匹配图片,构造URL: {constructed_url}{Colors.ENDC}")
            return constructed_url

        return None

    except (OSError, ValueError) as e:
        # OSError: unreadable file/directory. ValueError covers decode errors
        # and os.path.relpath failures.
        print(f"{Colors.RED}读取源文件时出错: {e}{Colors.ENDC}")
        return None
|
||||
|
||||
def get_absolute_doc_path(self, relative_path):
    """
    Convert a relative document link into a root-anchored site path.

    Args:
        relative_path: Relative link, e.g. ``./iteration.md`` or
            ``http-request.md``; may carry a ``#anchor``.

    Returns:
        The absolute path without file extension, e.g.
        ``/zh-hans/guides/workflow/nodes/iteration``, with the anchor
        re-appended. Links that already start with ``/`` and ``http(s)://``
        links are returned unchanged.
    """
    # Already absolute, or external: nothing to do.
    if relative_path.startswith('/'):
        return relative_path
    if relative_path.startswith(('http://', 'https://')):
        return relative_path

    # Split off the anchor, if any, so it can be re-attached at the end.
    relative_path, hash_mark, anchor = relative_path.partition('#')
    fragment = f'#{anchor}' if hash_mark else ""

    # Drop a trailing .md / .mdx extension.
    for extension in ('.mdx', '.md'):
        if relative_path.endswith(extension):
            relative_path = relative_path[:-len(extension)]
            break

    # Language prefix of the current file (e.g. zh-hans) and its directory,
    # both taken from self.rel_path.
    lang_prefix = self.rel_path.split(os.sep)[0]
    current_dir = os.path.dirname(self.rel_path)
    current_dir_parts = current_dir.split(os.sep)

    if relative_path.startswith(('./', '../')):
        # ./file or ../file: resolve against the current directory.
        full_path = os.path.normpath(os.path.join(current_dir, relative_path))
    else:
        # Bare name: prefer a file in the same directory.
        basename = os.path.basename(relative_path)
        same_level_path = os.path.normpath(os.path.join(current_dir, basename))

        if os.path.exists(os.path.join(self.mintlify_dir, same_level_path + '.mdx')):
            full_path = same_level_path
        elif "workflow" in current_dir and "node" in current_dir:
            # Inside a workflow node directory: try the shared nodes directory.
            possible_path = f"{lang_prefix}/guides/workflow/nodes/{basename}"
            if os.path.exists(os.path.join(self.mintlify_dir, possible_path + '.mdx')):
                full_path = possible_path
            else:
                full_path = same_level_path
                print(f"{Colors.YELLOW}警告: 无法找到文件 {possible_path}.mdx,使用默认路径{Colors.ENDC}")
        else:
            # Search the whole language tree for a file with that name.
            matches = []
            for root, _, files in os.walk(os.path.join(self.mintlify_dir, lang_prefix)):
                for file in files:
                    if file == f"{basename}.mdx" or file == f"{basename}.md":
                        rel_file_path = os.path.relpath(os.path.join(root, file), self.mintlify_dir)
                        matches.append(os.path.splitext(rel_file_path)[0])

            if matches:
                def overlap(candidate):
                    # Both paths are relative to mintlify_dir, so they are
                    # compared component by component from the start. The
                    # earlier code skipped the candidate's first component
                    # and compared misaligned parts.
                    return sum(1 for a, b in zip(current_dir_parts, candidate.split(os.sep)) if a == b)

                # max() keeps the first of several equally good candidates.
                full_path = max(matches, key=overlap)
            else:
                full_path = same_level_path
                print(f"{Colors.YELLOW}警告: 无法找到文件 {basename}.mdx,使用默认路径{Colors.ENDC}")

    # Site paths always use forward slashes, whatever the OS separator is.
    full_path = full_path.replace(os.sep, '/')

    if not full_path.startswith('/'):
        full_path = '/' + full_path

    return full_path + fragment
|
||||
|
||||
def process_file(self):
    """
    Rewrite image paths and document links in ``self.target_file``.

    Collects three kinds of replacements: Markdown images with a local path,
    ``<Frame>`` images with a local path, and ``.md`` document links. It
    shows them to the user and writes the selected ones back to the file.
    Replacements that would leave the text unchanged, and exact duplicates,
    are not listed.

    Returns:
        True when nothing needs changing or the changes were written;
        False when the user cancels, the selection is invalid, or an error
        occurs.
    """
    try:
        with open(self.target_file, 'r', encoding='utf-8') as f:
            content = f.read()

        # (old text, new text, change type)
        changes = []

        def record(old_text, new_text, change_type):
            """Queue a replacement unless it is a no-op or already queued."""
            entry = (old_text, new_text, change_type)
            if old_text != new_text and entry not in changes:
                changes.append(entry)

        # 1. Markdown images: ![alt text](/local/path.png)
        md_img_pattern = re.compile(r'!\[([^\]]*)\]\((/[^)]+)\)')
        for match in md_img_pattern.finditer(content):
            alt_text = match.group(1)
            online_url = self.get_corresponding_image_url(match.group(2))
            if online_url:
                record(match.group(0), f'![{alt_text}]({online_url})', '图片链接')

        # 2. Images inside <Frame> components.
        frame_img_pattern = re.compile(r'(<Frame[^>]*>[\s\S]*?<img[^>]*src=")(/[^"]+)("[^>]*>[\s\S]*?</Frame>)')
        for match in frame_img_pattern.finditer(content):
            prefix, local_path, suffix = match.group(1), match.group(2), match.group(3)
            online_url = self.get_corresponding_image_url(local_path)
            if online_url:
                record(match.group(0), f'{prefix}{online_url}{suffix}', 'Frame组件图片')

        # 3. Document links: [text](./path/to/file.md) or [text](path/to/file.md)
        doc_link_pattern = re.compile(r'\[([^\]]+)\]\((\./[^)]+\.md(?:#[^)]*)?|\.\./[^)]+\.md(?:#[^)]*)?|[^)]+\.md(?:#[^)]*)?)\)')
        for match in doc_link_pattern.finditer(content):
            link_text = match.group(1)
            # Separate the anchor so only the path is resolved.
            rel_path, hash_mark, anchor = match.group(2).partition('#')
            fragment = f'#{anchor}' if hash_mark else ""

            abs_path = self.get_absolute_doc_path(rel_path)
            if abs_path:
                # External and already-absolute links come back unchanged;
                # record() drops those no-op rewrites.
                record(match.group(0), f'[{link_text}]({abs_path}{fragment})', '文档链接')

        if not changes:
            print(f"{Colors.GREEN}文件不需要修改{Colors.ENDC}")
            return True

        # Preview the pending changes.
        print(f"\n{Colors.BLUE}找到 {len(changes)} 个需要修改的内容:{Colors.ENDC}")
        for i, (old, new, change_type) in enumerate(changes):
            print(f"{Colors.CYAN}修改 {i+1} ({change_type}):{Colors.ENDC}")
            print(f" - 原始内容: {Colors.YELLOW}{old[:100]}{'...' if len(old) > 100 else ''}{Colors.ENDC}")
            print(f" - 新内容: {Colors.GREEN}{new[:100]}{'...' if len(new) > 100 else ''}{Colors.ENDC}")
            print()

        # Ask which changes to apply: all, none, or a comma-separated list.
        response = input(f"{Colors.BOLD}是否应用这些修改? (y/n/部分修改输入数字如1,3,5): {Colors.ENDC}")

        if response.lower() == 'n':
            print(f"{Colors.BLUE}已取消修改{Colors.ENDC}")
            return False
        if response.lower() == 'y':
            selected_changes = changes
        else:
            try:
                indices = [int(i.strip()) - 1 for i in response.split(',')]
            except ValueError:
                # Only a malformed number list is treated as bad input.
                print(f"{Colors.YELLOW}输入格式有误,操作取消{Colors.ENDC}")
                return False
            selected_changes = [changes[i] for i in indices if 0 <= i < len(changes)]
            if not selected_changes:
                print(f"{Colors.YELLOW}未选择任何有效修改,操作取消{Colors.ENDC}")
                return False

        # Apply the selection; str.replace rewrites every occurrence.
        modified_content = content
        for old, new, _ in selected_changes:
            modified_content = modified_content.replace(old, new)

        with open(self.target_file, 'w', encoding='utf-8') as f:
            f.write(modified_content)

        print(f"{Colors.GREEN}成功应用 {len(selected_changes)} 个修改到文件{Colors.ENDC}")
        return True

    except Exception as e:
        # Boundary of the interactive tool: report the failure with a
        # traceback rather than crash.
        print(f"{Colors.RED}处理文件时出错: {e}{Colors.ENDC}")
        import traceback
        traceback.print_exc()
        return False
|
||||
|
||||
def main():
    """Command-line entry point: fix the links of a single target file.

    Expects exactly one argument, the path of the target ``.mdx`` file.
    Prints a usage message when the argument count is wrong, and an error
    when the file does not exist; in both cases it returns without
    processing anything.
    """
    argv = sys.argv

    # Exactly one positional argument is required.
    if len(argv) != 2:
        print(f"用法: {argv[0]} <目标文件路径>")
        print(f"例如: {argv[0]} /Users/allen/Documents/dify-docs-mintlify/zh-hans/guides/workflow/nodes/parameter-extractor.mdx")
        return

    target_file = argv[1]

    # Refuse to continue with a path that is not an existing file.
    if not os.path.isfile(target_file):
        print(f"{Colors.RED}文件不存在: {target_file}{Colors.ENDC}")
        return

    helper = DocMigrationHelper(target_file)

    print(f"{Colors.HEADER}开始处理文件: {target_file}{Colors.ENDC}")
    source_label = helper.source_file or '未找到'
    print(f"对应的源文件: {source_label}")

    helper.process_file()


if __name__ == "__main__":
    main()
|
||||
365
scripts/fix_links_interactive-3.25-backup.py
Normal file
365
scripts/fix_links_interactive-3.25-backup.py
Normal file
@@ -0,0 +1,365 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
交互式Markdown链接修复工具
|
||||
|
||||
这个脚本用于交互式地修复Markdown文件中的相对路径引用,将它们转换为
|
||||
从根目录开始的绝对路径格式(如 /zh-hans/xxx),以符合Mintlify文档要求。
|
||||
脚本支持处理单个文件或指定目录内的所有.mdx文件。
|
||||
|
||||
特点:
|
||||
- 交互式操作,精确可控
|
||||
- 提供修改预览
|
||||
- 支持单文件或目录处理
|
||||
- 将相对路径转换为绝对路径
|
||||
- 支持锚点保留
|
||||
- 移除文件扩展名
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import glob
|
||||
|
||||
# Markdown links whose target ends in .md or .mdx.
# Groups: 1 = link text, 2 = path incl. extension, 3 = extension,
#         4 = anchor without "#" (None if absent), 5 = closing ")".
MD_LINK_PATTERN = re.compile(r'\[([^\]]+)\]\(([^)]+\.(md|mdx))(?:#([^)]*))?(\))')

# Markdown links whose target does not start with "/" (relative paths).
# Groups: 1 = link text, 2 = path, 3 = anchor without "#" (None if absent),
#         4 = closing ")".
# "#" is excluded from the path so the anchor lands in group 3; with a plain
# [^)]+ the path swallowed the anchor and group 3 could never match.
REL_LINK_PATTERN = re.compile(r'\[([^\]]+)\]\(([^)/#][^)#]*)(?:#([^)]*))?(\))')
|
||||
|
||||
# 颜色代码,用于美化终端输出
|
||||
class Colors:
    """ANSI escape sequences used to style terminal output."""

    # Bright foreground colours.
    HEADER = '\033[95m'
    BLUE = '\033[94m'
    CYAN = '\033[96m'
    GREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'

    # Reset all attributes.
    ENDC = '\033[0m'

    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
|
||||
|
||||
|
||||
def find_file_in_project(root_dir, rel_path, current_file_dir):
    """
    Locate the file a link refers to.

    Resolution order:
      1. The path resolved directly: against ``root_dir`` when it starts
         with ``/``, otherwise against ``current_file_dir``. ``.mdx`` is
         tried first, then ``.md``.
      2. A search of the whole project for a file with the same base name,
         preferring ``.mdx`` over ``.md``.

    Args:
        root_dir: Project root directory.
        rel_path: Link target, with or without a ``.md``/``.mdx`` extension.
        current_file_dir: Directory of the file that contains the link.

    Returns:
        The path of the file that was found, or ``None``.
    """
    extensions = ('.mdx', '.md')

    # Drop the extension; both candidates are re-added below.
    for extension in extensions:
        if rel_path.endswith(extension):
            rel_path = rel_path[:-len(extension)]
            break

    # 1. Direct resolution. Plain "sub/file" links are relative to the
    # current file just like "./sub/file". os.path.isfile is used rather
    # than glob so that "[", "*" or "?" in a path are taken literally.
    if rel_path.startswith('/'):
        resolved = os.path.normpath(os.path.join(root_dir, rel_path.lstrip('/')))
    else:
        resolved = os.path.normpath(os.path.join(current_file_dir, rel_path))
    for extension in extensions:
        candidate = f"{resolved}{extension}"
        if os.path.isfile(candidate):
            return candidate

    # 2. Fall back to a project-wide search by base name.
    basename = os.path.basename(rel_path)
    if not basename:
        return None

    md_fallback = None
    for root, dirs, files in os.walk(root_dir):
        # Sorting in place makes the traversal order, and therefore the
        # chosen match, deterministic.
        dirs.sort()
        if f"{basename}.mdx" in files:
            return os.path.join(root, f"{basename}.mdx")
        if md_fallback is None and f"{basename}.md" in files:
            md_fallback = os.path.join(root, f"{basename}.md")

    # Only used when no .mdx file with that name exists anywhere.
    return md_fallback
|
||||
|
||||
def get_absolute_path(file_path, root_dir):
    """
    Return the site path of a file, relative to the project root.

    Args:
        file_path: Full path of the file.
        root_dir: Project root directory.

    Returns:
        A path such as ``/zh-hans/xxx``: leading slash, no file extension,
        and forward slashes regardless of the OS path separator.
    """
    rel_path = os.path.relpath(file_path, root_dir)
    stem, _extension = os.path.splitext(rel_path)
    # Links are URLs, so never emit the platform separator (e.g. "\").
    return "/" + stem.replace(os.sep, "/")
|
||||
|
||||
def process_file(file_path, root_dir, dry_run=False, auto_confirm=False):
    """
    Rewrite the relative links of one Markdown file as absolute site paths.

    Args:
        file_path: File to process.
        root_dir: Project root directory.
        dry_run: Only preview the changes; do not write the file.
        auto_confirm: Apply every change without asking.

    Returns:
        The number of links changed (in ``dry_run`` mode, the number of
        links that would change); 0 on error or when the user cancels.
    """
    print(f"\n{Colors.HEADER}处理文件:{Colors.ENDC} {file_path}")

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeError) as e:
        print(f"{Colors.FAIL}错误: 无法读取文件 - {e}{Colors.ENDC}")
        return 0

    current_file_dir = os.path.dirname(file_path)
    external_prefixes = ('http://', 'https://', 'mailto:', 'ftp://')

    # (old link, new link, target file)
    changes = []
    # A link ending in .md/.mdx is matched by both patterns; remember what
    # is already queued so it is listed and counted only once.
    seen = set()

    # Each pattern is paired with the index of its anchor group.
    for pattern, fragment_group in ((MD_LINK_PATTERN, 4), (REL_LINK_PATTERN, 3)):
        for m in pattern.finditer(content):
            link_text = m.group(1)
            link_path = m.group(2)
            fragment = m.group(fragment_group) or ""  # the anchor is optional
            full_match = m.group(0)

            # The relative-link pass leaves absolute paths alone.
            if pattern is REL_LINK_PATTERN and link_path.startswith('/'):
                continue
            if link_path.startswith(external_prefixes):
                continue

            actual_file = find_file_in_project(root_dir, link_path, current_file_dir)
            if not actual_file:
                continue

            abs_path = get_absolute_path(actual_file, root_dir)
            fragment_text = f"#{fragment}" if fragment else ""
            new_link = f"[{link_text}]({abs_path}{fragment_text})"

            key = (full_match, new_link)
            if new_link != full_match and key not in seen:
                seen.add(key)
                changes.append((full_match, new_link, actual_file))

    if not changes:
        print(f"{Colors.GREEN}没有找到需要修改的链接{Colors.ENDC}")
        return 0

    # Preview the pending changes.
    print(f"\n{Colors.BLUE}找到 {len(changes)} 个需要修改的链接:{Colors.ENDC}")
    for i, (old, new, target) in enumerate(changes):
        print(f"{Colors.CYAN}修改 {i+1}:{Colors.ENDC}")
        print(f" - 原始链接: {Colors.WARNING}{old}{Colors.ENDC}")
        print(f" - 新链接: {Colors.GREEN}{new}{Colors.ENDC}")
        print(f" - 目标文件: {os.path.relpath(target, root_dir)}\n")

    if dry_run:
        print(f"{Colors.BLUE}预览模式 - 未执行实际修改{Colors.ENDC}")
        return len(changes)

    if auto_confirm:
        selected_changes = changes
    else:
        response = input(f"{Colors.BOLD}是否应用这些修改? (y/n/部分修改输入数字如1,3,5): {Colors.ENDC}")

        if response.lower() == 'n':
            print(f"{Colors.BLUE}已取消修改{Colors.ENDC}")
            return 0
        if response.lower() == 'y':
            selected_changes = changes
        else:
            try:
                indices = [int(i.strip()) - 1 for i in response.split(',')]
            except ValueError:
                # Only a malformed number list is treated as bad input.
                print(f"{Colors.WARNING}输入格式有误,操作取消{Colors.ENDC}")
                return 0
            selected_changes = [changes[i] for i in indices if 0 <= i < len(changes)]
            if not selected_changes:
                print(f"{Colors.WARNING}未选择任何有效修改,操作取消{Colors.ENDC}")
                return 0

    # Apply the selection; str.replace rewrites every occurrence.
    modified_content = content
    for old, new, _ in selected_changes:
        modified_content = modified_content.replace(old, new)

    try:
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(modified_content)
    except (OSError, UnicodeError) as e:
        print(f"{Colors.FAIL}错误: 无法写入文件 - {e}{Colors.ENDC}")
        return 0

    print(f"{Colors.GREEN}成功应用 {len(selected_changes)} 个修改到文件{Colors.ENDC}")
    return len(selected_changes)
|
||||
|
||||
def scan_directory(dir_path, root_dir, dry_run=False, auto_confirm=False):
    """
    Process every ``.mdx`` file below a directory.

    Args:
        dir_path: Directory to scan.
        root_dir: Project root directory.
        dry_run: Only preview the changes.
        auto_confirm: Apply every change without asking.

    Returns:
        A tuple ``(file_count, total_changes)``: the number of files for
        which ``process_file`` reported at least one change, and the sum of
        those reported changes.
    """
    file_count = 0
    total_changes = 0

    print(f"{Colors.HEADER}扫描目录: {dir_path}{Colors.ENDC}")

    # Collect all .mdx files.
    mdx_files = []
    for root, _, files in os.walk(dir_path):
        mdx_files.extend(os.path.join(root, name) for name in files if name.endswith('.mdx'))

    if not mdx_files:
        print(f"{Colors.WARNING}在目录中未找到.mdx文件{Colors.ENDC}")
        return 0, 0

    print(f"{Colors.BLUE}找到 {len(mdx_files)} 个.mdx文件{Colors.ENDC}")

    # The progress display counts the position in the list, not file_count,
    # which only advances for files that actually had changes.
    for position, file_path in enumerate(mdx_files, start=1):
        rel_path = os.path.relpath(file_path, root_dir)
        print(f"\n{Colors.BOLD}处理文件 ({position}/{len(mdx_files)}): {rel_path}{Colors.ENDC}")

        # Let the user skip this file or stop the whole run.
        if not auto_confirm:
            response = input(f"{Colors.BOLD}是否处理此文件? (y/n/q-退出): {Colors.ENDC}")
            if response.lower() == 'n':
                print(f"{Colors.BLUE}跳过此文件{Colors.ENDC}")
                continue
            if response.lower() == 'q':
                print(f"{Colors.BLUE}退出处理{Colors.ENDC}")
                break

        changes = process_file(file_path, root_dir, dry_run, auto_confirm)
        if changes > 0:
            file_count += 1
            total_changes += changes

    return file_count, total_changes
|
||||
|
||||
def main():
    """Interactive entry point of the link-fixing tool.

    Derives the project root from the script location, then loops over a
    three-item menu (single file, directory, quit) until the user quits.
    """
    # The script lives in <project>/scripts, so the project root is one
    # level above the script directory.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)

    def ask(question):
        """Prompt in bold and return the raw answer."""
        return input(f"{Colors.BOLD}{question}{Colors.ENDC}")

    # Welcome banner.
    print(f"\n{Colors.HEADER}{'='*60}{Colors.ENDC}")
    print(f"{Colors.HEADER} Mintlify文档链接修复工具 {Colors.ENDC}")
    print(f"{Colors.HEADER}{'='*60}{Colors.ENDC}")
    print(f"项目根目录: {project_root}\n")

    while True:
        print(f"\n{Colors.BOLD}请选择操作模式:{Colors.ENDC}")
        print("1. 处理单个文件")
        print("2. 处理指定目录中的所有.mdx文件")
        print("3. 退出")

        choice = ask("请输入选项 (1-3): ")

        if choice == '3':
            print(f"{Colors.BLUE}感谢使用,再见!{Colors.ENDC}")
            break

        if choice == '1':
            # Single file, given relative to the project root.
            file_path = os.path.join(project_root, ask("请输入文件路径 (相对于项目根目录): "))
            if not os.path.isfile(file_path):
                print(f"{Colors.FAIL}错误: 文件不存在{Colors.ENDC}")
                continue

            dry_run = ask("是否只预览修改而不实际写入? (y/n): ").lower() == 'y'
            changes = process_file(file_path, project_root, dry_run)
            print(f"\n{Colors.GREEN}处理完成! 共发现 {changes} 个需要修改的链接{Colors.ENDC}")

        elif choice == '2':
            # Whole directory, given relative to the project root.
            dir_path = os.path.join(project_root, ask("请输入目录路径 (相对于项目根目录): "))
            if not os.path.isdir(dir_path):
                print(f"{Colors.FAIL}错误: 目录不存在{Colors.ENDC}")
                continue

            dry_run = ask("是否只预览修改而不实际写入? (y/n): ").lower() == 'y'
            auto_confirm = ask("是否自动确认所有修改? (y/n): ").lower() == 'y'
            file_count, total_changes = scan_directory(dir_path, project_root, dry_run, auto_confirm)
            print(f"\n{Colors.GREEN}处理完成! 共处理 {file_count} 个文件,修改了 {total_changes} 个链接{Colors.ENDC}")

        else:
            print(f"{Colors.WARNING}无效选项,请重试{Colors.ENDC}")


if __name__ == "__main__":
    main()
|
||||
@@ -37,7 +37,7 @@ class Colors:
|
||||
# 1. Markdown格式: 
|
||||
# 2. HTML格式: <img src="https://assets-docs.dify.ai/..." alt="..." />
|
||||
# 3. Frame标签中的图片: <Frame>...<img src="https://assets-docs.dify.ai/..." />...</Frame>
|
||||
# 4. 相对路径图片: 
|
||||
# 4. 相对路径图片: 
|
||||
|
||||
# Markdown格式图片
|
||||
MD_IMG_PATTERN = re.compile(r'!\[(.*?)\]\((https?://[^)]+|/[^)]+)\)')
|
||||
@@ -49,7 +49,7 @@ HTML_IMG_PATTERN = re.compile(r'<img\s+src="([^"]+)"[^>]*>')
|
||||
ASSETS_URL_PREFIX = 'https://assets-docs.dify.ai/'
|
||||
|
||||
# 相对路径特征
|
||||
RELATIVE_PATH_PREFIX = '/zh-cn/user-guide/.gitbook/assets/'
|
||||
RELATIVE_PATH_PREFIX = '/zh-cn/'
|
||||
|
||||
def find_corresponding_file(source_file: str, source_dir: str, target_dir: str) -> Optional[str]:
|
||||
"""查找源文件在目标目录中的对应文件"""
|
||||
@@ -120,6 +120,18 @@ def extract_image_links(content: str) -> List[Tuple[str, str, str]]:
|
||||
|
||||
return images
|
||||
|
||||
def generate_markdown_replacement(old_match: str, old_url: str, new_url: str) -> str:
    """
    Build the replacement text for a Markdown image tag.

    Every occurrence of ``old_url`` inside ``old_match`` is swapped for
    ``new_url``; the rest of the tag is left untouched.
    """
    rewritten_tag = old_match.replace(old_url, new_url)
    return rewritten_tag
|
||||
|
||||
def generate_html_replacement(old_match: str, old_url: str, new_url: str) -> str:
    """
    Build the replacement text for an HTML image tag.

    Only the ``src="..."`` attribute holding ``old_url`` is rewritten, so
    the same URL appearing elsewhere in the tag is left untouched.
    """
    old_attribute = f'src="{old_url}"'
    new_attribute = f'src="{new_url}"'
    return old_match.replace(old_attribute, new_attribute)
|
||||
|
||||
def generate_frame_replacement(old_content: str, new_image_url: str) -> str:
|
||||
"""
|
||||
生成Frame标签的替换内容
|
||||
@@ -171,8 +183,16 @@ def sync_image_links(source_file: str, target_file: str, dry_run: bool = False)
|
||||
online_images = [(match, alt, url) for match, alt, url in source_images if url.startswith(ASSETS_URL_PREFIX)]
|
||||
|
||||
if not online_images:
|
||||
print(f"{Colors.WARNING}警告: 源文件中没有找到在线图片链接{Colors.ENDC}")
|
||||
return 0, []
|
||||
|
||||
# 提取目标文件中的图片链接
|
||||
target_images = extract_image_links(target_content)
|
||||
relative_images = [(match, alt, url) for match, alt, url in target_images if url.startswith(RELATIVE_PATH_PREFIX)]
|
||||
|
||||
if not relative_images:
|
||||
print(f"{Colors.BLUE}目标文件中没有找到相对路径图片链接{Colors.ENDC}")
|
||||
|
||||
# 处理目标文件中的内容
|
||||
new_content = target_content
|
||||
modified_links = []
|
||||
@@ -183,7 +203,7 @@ def sync_image_links(source_file: str, target_file: str, dry_run: bool = False)
|
||||
|
||||
# 处理每个在线图片链接
|
||||
for _, _, image_url in online_images:
|
||||
# 查找目标文件中可能的相对路径格式
|
||||
# 1. 首先处理Frame标签中的图片
|
||||
for match in frame_matches:
|
||||
frame_text = match.group(0)
|
||||
old_url = match.group(1)
|
||||
@@ -197,6 +217,25 @@ def sync_image_links(source_file: str, target_file: str, dry_run: bool = False)
|
||||
new_frame = generate_frame_replacement(frame_text, image_url)
|
||||
new_content = new_content.replace(frame_text, new_frame)
|
||||
modified_links.append((old_url, image_url))
|
||||
|
||||
# 2. 处理Markdown格式的图片
|
||||
for match, alt, url in relative_images:
|
||||
# 跳过已经是在线链接的图片
|
||||
if url.startswith(ASSETS_URL_PREFIX):
|
||||
continue
|
||||
|
||||
# 如果是相对路径,替换为在线链接
|
||||
if url.startswith('/'):
|
||||
if "!(" in match or "![" in match:
|
||||
# Markdown格式
|
||||
new_md = generate_markdown_replacement(match, url, image_url)
|
||||
new_content = new_content.replace(match, new_md)
|
||||
else:
|
||||
# HTML格式
|
||||
new_html = generate_html_replacement(match, url, image_url)
|
||||
new_content = new_content.replace(match, new_html)
|
||||
|
||||
modified_links.append((url, image_url))
|
||||
|
||||
# 如果是预览模式,不写入修改
|
||||
if dry_run:
|
||||
@@ -225,6 +264,11 @@ def process_file(source_file: str, source_dir: str, target_dir: str, dry_run: bo
|
||||
"""
|
||||
print(f"{Colors.HEADER}处理文件: {source_file}{Colors.ENDC}")
|
||||
|
||||
# 检查源文件是否存在
|
||||
if not os.path.isfile(source_file):
|
||||
print(f"{Colors.FAIL}错误: 源文件不存在: {source_file}{Colors.ENDC}")
|
||||
return False, 0
|
||||
|
||||
# 查找对应文件
|
||||
target_file = find_corresponding_file(source_file, source_dir, target_dir)
|
||||
|
||||
@@ -334,8 +378,11 @@ def main():
|
||||
|
||||
if choice == '1':
|
||||
# 处理单个文件
|
||||
source_file = input(f"{Colors.BOLD}请输入源文件路径 (相对于源目录): {Colors.ENDC}")
|
||||
source_file = os.path.join(default_source_dir, source_file)
|
||||
source_file = input(f"{Colors.BOLD}请输入源文件路径: {Colors.ENDC}")
|
||||
|
||||
# 如果用户输入的是相对路径,则转换为绝对路径
|
||||
if not os.path.isabs(source_file):
|
||||
source_file = os.path.join(default_source_dir, source_file)
|
||||
|
||||
if not os.path.isfile(source_file):
|
||||
print(f"{Colors.FAIL}错误: 文件不存在: {source_file}{Colors.ENDC}")
|
||||
@@ -356,8 +403,11 @@ def main():
|
||||
|
||||
elif choice == '2':
|
||||
# 处理目录
|
||||
dir_path = input(f"{Colors.BOLD}请输入要处理的源目录路径 (相对于源目录): {Colors.ENDC}")
|
||||
dir_path = os.path.join(default_source_dir, dir_path)
|
||||
dir_path = input(f"{Colors.BOLD}请输入要处理的源目录路径: {Colors.ENDC}")
|
||||
|
||||
# 如果用户输入的是相对路径,则转换为绝对路径
|
||||
if not os.path.isabs(dir_path):
|
||||
dir_path = os.path.join(default_source_dir, dir_path)
|
||||
|
||||
if not os.path.isdir(dir_path):
|
||||
print(f"{Colors.FAIL}错误: 目录不存在: {dir_path}{Colors.ENDC}")
|
||||
@@ -386,4 +436,4 @@ def main():
|
||||
print(f"{Colors.WARNING}无效选项,请重试{Colors.ENDC}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
99
scripts/test_conversion.py
Normal file
99
scripts/test_conversion.py
Normal file
@@ -0,0 +1,99 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
测试脚本,用于测试图片格式转换
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Tuple, List
|
||||
|
||||
# Matches an <img> wrapped in a <Frame> component.
# Groups: 1 = caption, 2 = width, 3 = src, 4 = alt (each optional except src).
FRAME_IMG_PATTERN = re.compile(
    r'<Frame(?:\s+caption="([^"]*)")?(?:\s+width="([^"]*)")?\s*>\s*'
    r'<img\s+src="([^"]+)"(?:\s+alt="([^"]*)")?\s*(?:\/\s*>|\/ >|>\s*<\/img>)\s*'
    r'<\/Frame>',
    re.DOTALL
)


def convert_frame_to_markdown(content: str) -> Tuple[str, List[Tuple[str, str, str]]]:
    """
    Convert images wrapped in ``<Frame>`` tags to Markdown or HTML.

    A frame with a ``width`` attribute becomes a multi-line ``<img ... />``
    element; any other frame becomes a Markdown image. The alt text falls
    back to the frame caption when the image has no ``alt`` attribute.

    Args:
        content: File content.

    Returns:
        A tuple of the converted content and a list of
        ``(original text, replacement text, format name)`` records.
    """
    replacements = []
    pieces = []
    cursor = 0

    for found in FRAME_IMG_PATTERN.finditer(content):
        caption = found.group(1) or ""
        width = found.group(2)  # None when the frame has no width
        src = found.group(3)
        alt = found.group(4) or caption or ""

        if width:
            # Frames with a width keep it by becoming an HTML <img>.
            new_format = "HTML"
            rendered = "\n".join([
                "<img",
                f'src="{src}"',
                f'width="{width}"',
                'className="mx-auto"',
                f'alt="{alt}"',
                "/>",
            ])
        else:
            new_format = "Markdown"
            rendered = f"![{alt}]({src})"

        replacements.append((found.group(0), rendered, new_format))

        # Copy the untouched text before this frame, then the replacement.
        pieces.append(content[cursor:found.start()])
        pieces.append(rendered)
        cursor = found.end()

    pieces.append(content[cursor:])
    return "".join(pieces), replacements
|
||||
|
||||
# Manual smoke test against one document on the author's machine.
test_file = "/Users/allen/Documents/dify-docs-mintlify/zh-hans/guides/workflow/nodes/ifelse.mdx"

# Load the document.
with open(test_file, 'r', encoding='utf-8') as f:
    content = f.read()

# Raw pattern matches.
matches = FRAME_IMG_PATTERN.findall(content)
print(f"找到 {len(matches)} 个匹配")

# Details of every match.
for i, match in enumerate(matches):
    caption, width, src, alt = match
    print(f"Match {i+1}:")
    for label, value in (("caption", caption), ("width", width), ("src", src), ("alt", alt)):
        print(f" {label}: '{value}'")

# Run the conversion.
new_content, replacements = convert_frame_to_markdown(content)

# Details of every replacement.
print(f"\n找到 {len(replacements)} 个需要替换的内容")
for i, (original, new, format_type) in enumerate(replacements):
    print(f"替换 {i+1} ({format_type}):")
    print(f"原始: {original[:100]}...")
    print(f"新的: {new}")
    print()

# When something was replaced, preview the first 20 lines of the result.
# Nothing is written back to the file.
if replacements:
    print("替换后的内容示例:")
    for line in new_content.split('\n')[:20]:
        print(line)
|
||||
Reference in New Issue
Block a user