Files
dify-docs/scripts/doc_link_checker.py
2025-03-26 00:42:17 +08:00

268 lines
9.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
import os
import re
import argparse
from pathlib import Path
from typing import List, Tuple, Dict, Set
import time
import sys
# ANSI escape sequences used to colorize terminal output.
class Colors:
    """Namespace of ANSI SGR escape codes; never instantiated."""

    # Foreground colors (bright variants).
    HEADER = "\033[95m"
    BLUE = "\033[94m"
    GREEN = "\033[92m"
    WARNING = "\033[93m"
    FAIL = "\033[91m"

    # Reset code: ends whatever styling is currently active.
    ENDC = "\033[0m"

    # Text attributes.
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"
def log_info(message):
    """Print ``message`` to stdout behind a blue ``[INFO]`` tag."""
    tag = f"{Colors.BLUE}[INFO]{Colors.ENDC}"
    print(f"{tag} {message}")
def log_warning(message):
    """Print ``message`` to stdout behind a yellow ``[WARNING]`` tag."""
    tag = f"{Colors.WARNING}[WARNING]{Colors.ENDC}"
    print(f"{tag} {message}")
def log_error(message):
    """Print ``message`` to stdout behind a red ``[ERROR]`` tag."""
    tag = f"{Colors.FAIL}[ERROR]{Colors.ENDC}"
    print(f"{tag} {message}")
def log_success(message):
    """Print ``message`` to stdout behind a green ``[SUCCESS]`` tag."""
    tag = f"{Colors.GREEN}[SUCCESS]{Colors.ENDC}"
    print(f"{tag} {message}")
def find_all_md_files(base_dir: str) -> List[Path]:
    """Recursively collect every ``.md`` and ``.mdx`` file under ``base_dir``.

    Args:
        base_dir: Root directory to search (a string or ``Path``).

    Returns:
        The matching regular files, sorted by path so that processing order
        is the same on every run and platform.
    """
    base_path = Path(base_dir)
    md_files = [
        path
        for pattern in ("**/*.md", "**/*.mdx")
        for path in base_path.glob(pattern)
        # A directory may itself be named "something.md"; it cannot be
        # opened as a document, so only regular files are kept.
        if path.is_file()
    ]
    # glob() yields entries in filesystem order, which is not stable.
    md_files.sort()
    return md_files
def extract_links(file_content: str) -> List[Tuple[str, str, str]]:
    """Collect every link found in a document's text.

    Three syntaxes are scanned, and their results are concatenated in this
    order: Markdown ``[text](url)``, HTML ``<a ... href="url">text</a>``,
    and Mintlify ``<Card title="..." ... href="url">...</Card>``.

    Returns:
        A list of ``(full_match, link_text, url)`` tuples. ``full_match`` is
        a normalized reconstruction of the link (extra tag attributes are
        dropped), not necessarily the verbatim source text. For cards the
        ``link_text`` element is the card title.
    """
    markdown_pattern = r'\[(.*?)\]\((.*?)\)'
    anchor_pattern = r'<a\s+(?:[^>]*?\s+)?href="([^"]*)"[^>]*>(.*?)<\/a>'
    card_pattern = r'<Card\s+title="([^"]*)"[^>]*\s+href="([^"]*)"[^>]*>(.*?)<\/Card>'

    # Markdown links: groups are (text, url).
    results = [
        (f"[{label}]({target})", label, target)
        for label, target in re.findall(markdown_pattern, file_content)
    ]
    # <a> tags: groups are (url, text).
    results += [
        (f'<a href="{target}">{label}</a>', label, target)
        for target, label in re.findall(anchor_pattern, file_content)
    ]
    # Card components may span several lines, hence DOTALL.
    results += [
        (f'<Card title="{heading}" href="{target}">{body}</Card>', heading, target)
        for heading, target, body in re.findall(card_pattern, file_content, re.DOTALL)
    ]
    return results
def check_link_extensions(links: List[Tuple[str, str, str]],
                          file_path: Path,
                          all_files: Dict[str, Path],
                          base_dir: Path) -> List[Tuple[str, str, str, str]]:
    """Find relative links whose target carries a ``.md`` / ``.mdx`` extension.

    External links (``http://``, ``https://``, ``mailto:``, ``tel:``),
    pure anchors (``#...``) and absolute paths (``/...``) are ignored.
    The extension is tested on the path part of the URL only, so links such
    as ``page.md#section`` or ``page.md?tab=1`` are detected too, and the
    query/fragment is carried over unchanged into the fixed URL.

    Args:
        links: ``(full_match, link_text, url)`` tuples to inspect.
        file_path: File the links came from (currently unused).
        all_files: Mapping of known document paths (currently unused).
        base_dir: Documentation root (currently unused).

    Returns:
        A list of ``(full_match, link_text, original_url, fixed_url)``
        tuples, one per offending link, in input order.
    """
    ignored_prefixes = ('http://', 'https://', '#', 'mailto:', 'tel:', '/')
    issues = []
    for full_match, text, url in links:
        if url.startswith(ignored_prefixes):
            continue
        # Split the URL at the first '?' or '#', whichever comes first, so
        # only the path part is examined for a document extension.
        cut = len(url)
        for marker in ('?', '#'):
            position = url.find(marker)
            if position != -1:
                cut = min(cut, position)
        path_part, trailer = url[:cut], url[cut:]
        if path_part.endswith(('.md', '.mdx')):
            fixed_url = path_part.rsplit('.', 1)[0] + trailer
            issues.append((full_match, text, url, fixed_url))
    return issues
def fix_links(file_path: Path, issues: List[Tuple[str, str, str, str]], dry_run: bool = True) -> bool:
    """Rewrite the offending link targets inside one file.

    Args:
        file_path: Path of the file to fix.
        issues: ``(full_match, link_text, original_url, fixed_url)`` tuples
            describing the links to rewrite.
        dry_run: When True the file is read and the replacements are
            computed, but nothing is written back.

    Returns:
        True only if the file was actually rewritten on disk; False for an
        empty issue list, a dry run, or when no replacement changed the text.
    """
    if not issues:
        return False

    # newline='' disables newline translation so the file's existing line
    # endings (e.g. CRLF) survive the read/modify/write round trip.
    with open(file_path, 'r', encoding='utf-8', newline='') as f:
        content = f.read()

    modified_content = content
    for full_match, _text, old_url, new_url in issues:
        # Markdown matches are the only ones that start with '['. Testing the
        # prefix (rather than searching for "Card" or "<a" anywhere) keeps a
        # Markdown link whose text mentions those strings from being treated
        # as an HTML tag and left unfixed.
        if full_match.startswith('['):
            old_pattern = f']({old_url})'
            new_pattern = f']({new_url})'
        else:
            # <a> tags and Card components both carry the URL in href="...".
            old_pattern = f'href="{old_url}"'
            new_pattern = f'href="{new_url}"'
        modified_content = modified_content.replace(old_pattern, new_pattern)

    if dry_run or modified_content == content:
        return False

    with open(file_path, 'w', encoding='utf-8', newline='') as f:
        f.write(modified_content)
    return True
def process_file(file_path: Path, all_files: Dict[str, Path], base_dir: Path, args):
    """Check one document for link-extension problems and optionally fix them.

    The file is read, its links are extracted and checked, and any problems
    are listed on stdout. Unless ``args.auto_fix`` is set, the user is asked
    whether to fix them: ``y`` fixes this file, ``a`` fixes this file and
    sets ``args.auto_fix`` so later files are not prompted, ``q`` exits the
    script, and anything else skips the file.

    Args:
        file_path: Document to process.
        all_files: Mapping of known document paths, forwarded to the checker.
        base_dir: Documentation root; used to print a relative path.
        args: Parsed CLI namespace; ``auto_fix`` and ``dry_run`` are read and
            ``auto_fix`` may be set to True.

    Returns:
        True if the file was rewritten, or, in dry-run mode, if it has
        problems that would be fixed, so the caller's dry-run summary counts
        it. False if there were no problems, the user skipped the file, or an
        error occurred.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        links = extract_links(content)
        issues = check_link_extensions(links, file_path, all_files, base_dir)
        if not issues:
            return False

        rel_path = file_path.relative_to(base_dir)
        print(f"\n{Colors.HEADER}{Colors.BOLD}检查文件: {rel_path}{Colors.ENDC}")
        for i, (full_match, text, old_url, new_url) in enumerate(issues, 1):
            print(f" {i}. 发现问题: {Colors.WARNING}{old_url}{Colors.ENDC} -> {Colors.GREEN}{new_url}{Colors.ENDC}")

        # Ask for confirmation unless everything is fixed automatically.
        if not args.auto_fix:
            choice = input(f"\n{Colors.BOLD}修复这些问题? (y/n/a/q): {Colors.ENDC}").strip().lower()
            if choice == 'q':  # quit the whole script
                log_info("用户请求退出脚本")
                sys.exit(0)
            elif choice == 'a':  # fix all: stop prompting for later files
                args.auto_fix = True
            if choice not in ('y', 'a'):
                log_info(f"跳过修复 {rel_path}")
                return False

        fixed = fix_links(file_path, issues, dry_run=args.dry_run)

        if args.dry_run:
            log_info(f"已检测到 {len(issues)} 个需要修复的链接 (模拟运行,实际未修改)")
            # Nothing is written in a dry run, but the file does have
            # problems; report it so the caller's summary is not always 0.
            return True

        if fixed:
            log_success(f"已修复 {len(issues)} 个链接问题")
            # In interactive mode, pause so the user can read the result
            # before the next file's output appears.
            if not args.auto_fix:
                input(f"\n{Colors.BOLD}已完成修复,按回车继续下一个文件...{Colors.ENDC}")
        return fixed
    except Exception as e:
        # Best effort per file: report the failure and let the caller go on
        # with the remaining files. sys.exit() above raises SystemExit, which
        # is not an Exception subclass and therefore still propagates.
        log_error(f"处理文件 {file_path} 时出错: {str(e)}")
        return False
def main():
    """Command-line entry point: scan a docs tree and fix link extensions.

    Parses the CLI arguments (asking for the root directory interactively
    when it is omitted), asks for confirmation, then runs ``process_file``
    on every ``.md`` / ``.mdx`` file under the root and prints a summary.
    Ctrl-C during processing stops the loop but still prints the summary.

    Returns:
        Process exit status: 1 if the given directory is missing or invalid,
        0 otherwise (including when the user cancels).
    """
    parser = argparse.ArgumentParser(description='检查并修复文档中的链接问题')
    parser.add_argument('doc_path', nargs='?', help='文档根目录路径')
    parser.add_argument('--dry-run', action='store_true', help='只显示将要修改的内容,不实际修改文件')
    parser.add_argument('--auto-fix', action='store_true', help='自动修复所有问题,不询问')
    args = parser.parse_args()

    # Ask interactively when no path was given on the command line.
    if args.doc_path is None:
        doc_path = input(f"{Colors.BOLD}请输入文档根目录路径: {Colors.ENDC}")
        args.doc_path = doc_path.strip()

    base_dir = Path(args.doc_path)
    # Path("") resolves to the current directory, so an empty answer must be
    # rejected explicitly or the script would silently scan the cwd.
    if not args.doc_path or not base_dir.is_dir():
        log_error(f"指定的目录 '{args.doc_path}' 不存在或不是一个目录")
        return 1

    # Show what is about to happen and ask for confirmation.
    print(f"\n{Colors.BOLD}将要扫描的目录:{Colors.ENDC} {Colors.GREEN}{base_dir}{Colors.ENDC}")
    if args.dry_run:
        print(f"{Colors.BOLD}模式:{Colors.ENDC} {Colors.BLUE}仅检查,不修改文件{Colors.ENDC}")
    elif args.auto_fix:
        print(f"{Colors.BOLD}模式:{Colors.ENDC} {Colors.BLUE}自动修复所有问题{Colors.ENDC}")
    else:
        print(f"{Colors.BOLD}模式:{Colors.ENDC} {Colors.BLUE}交互式修复{Colors.ENDC}")

    confirm = input(f"\n{Colors.BOLD}确认开始扫描? (y/n): {Colors.ENDC}")
    if confirm.strip().lower() != 'y':
        log_info("操作已取消")
        return 0

    log_info(f"开始扫描目录: {base_dir}")

    all_files_list = find_all_md_files(base_dir)
    log_info(f"共找到 {len(all_files_list)} 个文档文件")

    # Map each path relative to the root onto its full path; passed through
    # to process_file for the link checker.
    all_files = {}
    for file_path in all_files_list:
        rel_path = file_path.relative_to(base_dir)
        all_files[str(rel_path)] = file_path

    fixed_count = 0
    processed_count = 0
    total_files = len(all_files_list)
    try:
        for i, file_path in enumerate(all_files_list, 1):
            # Blank out the current line, then redraw the progress indicator.
            sys.stdout.write("\r" + " " * 80)
            sys.stdout.write(f"\r{Colors.BOLD}进度: {i}/{total_files} ({i/total_files*100:.1f}%){Colors.ENDC}")
            sys.stdout.flush()

            if process_file(file_path, all_files, base_dir, args):
                fixed_count += 1
            # Counted only once the file is finished, so the summary stays
            # truthful when the loop is interrupted part-way.
            processed_count = i
    except KeyboardInterrupt:
        print("\n")
        log_warning("用户中断了处理过程")
        # Fall through and report what was completed so far.

    print("\n")
    log_info(f"扫描完成,共处理 {processed_count} 个文件")
    if args.dry_run:
        log_info(f"发现 {fixed_count} 个文件中有链接问题需要修复")
    else:
        log_success(f"已修复 {fixed_count} 个文件中的链接问题")
    return 0
# Run the CLI only on direct execution; main()'s return value becomes the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())