mirror of
https://github.com/langgenius/dify-docs.git
synced 2026-03-27 13:28:32 +07:00
268 lines
9.9 KiB
Python
268 lines
9.9 KiB
Python
#!/usr/bin/env python3
|
||
import os
|
||
import re
|
||
import argparse
|
||
from pathlib import Path
|
||
from typing import List, Tuple, Dict, Set
|
||
import time
|
||
import sys
|
||
|
||
# ANSI escape sequences used to colorize terminal output.
class Colors:
    """Namespace of ANSI SGR escape codes for styling console text."""

    # Bright foreground colors.
    HEADER = "\033[95m"   # bright magenta
    BLUE = "\033[94m"
    GREEN = "\033[92m"
    WARNING = "\033[93m"  # bright yellow
    FAIL = "\033[91m"     # bright red

    # Text attributes.
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"

    # Resets every color/attribute set by the codes above.
    ENDC = "\033[0m"
|
||
|
||
def log_info(message):
    """Print ``message`` to stdout behind a blue ``[INFO]`` tag."""
    tag = f"{Colors.BLUE}[INFO]{Colors.ENDC}"
    print(f"{tag} {message}")
|
||
|
||
def log_warning(message):
    """Print ``message`` to stdout behind a yellow ``[WARNING]`` tag."""
    tag = f"{Colors.WARNING}[WARNING]{Colors.ENDC}"
    print(f"{tag} {message}")
|
||
|
||
def log_error(message):
    """Print ``message`` to stdout behind a red ``[ERROR]`` tag."""
    tag = f"{Colors.FAIL}[ERROR]{Colors.ENDC}"
    print(f"{tag} {message}")
|
||
|
||
def log_success(message):
    """Print ``message`` to stdout behind a green ``[SUCCESS]`` tag."""
    tag = f"{Colors.GREEN}[SUCCESS]{Colors.ENDC}"
    print(f"{tag} {message}")
|
||
|
||
def find_all_md_files(base_dir: str) -> List[Path]:
    """Recursively collect every ``.md`` and ``.mdx`` file under ``base_dir``.

    Args:
        base_dir: Root directory to search. Anything accepted by
            ``pathlib.Path`` works (a string or an existing ``Path``).

    Returns:
        Paths of the matching regular files: all ``.md`` files first, then
        all ``.mdx`` files. Each group is sorted so the processing order is
        reproducible regardless of the filesystem's enumeration order.
    """
    base_path = Path(base_dir)
    md_files: List[Path] = []

    for ext in ("*.md", "*.mdx"):
        # A directory may itself be named e.g. "guide.md"; the glob matches it
        # too, but only regular files can be opened and scanned afterwards.
        md_files.extend(
            sorted(p for p in base_path.glob(f"**/{ext}") if p.is_file())
        )

    return md_files
|
||
|
||
def extract_links(file_content: str) -> List[Tuple[str, str, str]]:
    """Collect every link found in a document's text.

    Three syntaxes are recognized: Markdown ``[text](url)``, HTML
    ``<a ... href="url">text</a>`` and Mintlify
    ``<Card title="..." ... href="url">...</Card>``.

    Returns:
        A list of ``(full_match, label, url)`` tuples, grouped by syntax in
        the order Markdown, ``<a>``, ``<Card>``. ``full_match`` is a
        normalized reconstruction of the link (extra HTML attributes are not
        kept), not necessarily the exact source text.
    """
    markdown_hits = re.findall(r'\[(.*?)\]\((.*?)\)', file_content)
    anchor_hits = re.findall(
        r'<a\s+(?:[^>]*?\s+)?href="([^"]*)"[^>]*>(.*?)<\/a>', file_content
    )
    # Card bodies usually span several lines, hence DOTALL.
    card_hits = re.findall(
        r'<Card\s+title="([^"]*)"[^>]*\s+href="([^"]*)"[^>]*>(.*?)<\/Card>',
        file_content,
        re.DOTALL,
    )

    found = [
        (f"[{label}]({target})", label, target)
        for label, target in markdown_hits
    ]
    found += [
        (f'<a href="{target}">{label}</a>', label, target)
        for target, label in anchor_hits
    ]
    found += [
        (f'<Card title="{heading}" href="{target}">{body}</Card>', heading, target)
        for heading, target, body in card_hits
    ]
    return found
|
||
|
||
def check_link_extensions(links: List[Tuple[str, str, str]],
                          file_path: Path,
                          all_files: Dict[str, Path],
                          base_dir: Path) -> List[Tuple[str, str, str, str]]:
    """Find relative links whose target still carries a ``.md``/``.mdx`` suffix.

    External links (``http``, ``https``, ``mailto``, ``tel``), pure anchors
    (``#...``) and absolute paths (leading ``/``) are skipped. A trailing
    ``#fragment`` or ``?query`` is ignored when testing the extension and is
    re-attached to the fixed URL, so ``page.md#intro`` becomes ``page#intro``.

    Args:
        links: ``(full_match, label, url)`` tuples as produced by
            ``extract_links``.
        file_path: File the links came from (currently unused).
        all_files: Mapping of known document paths (currently unused).
        base_dir: Documentation root (currently unused).

    Returns:
        A list of ``(full_match, label, original_url, fixed_url)`` tuples,
        one per offending link, in input order.
    """
    issues = []

    for full_match, text, url in links:
        # Skip external links and in-page anchors.
        if url.startswith(('http://', 'https://', '#', 'mailto:', 'tel:')):
            continue

        # Skip absolute paths that start with "/".
        if url.startswith('/'):
            continue

        # Separate the path from an optional "#fragment" / "?query" tail so
        # the extension test looks at the file name only.
        cut = len(url)
        for sep in ('#', '?'):
            pos = url.find(sep)
            if pos != -1:
                cut = min(cut, pos)
        path_part, tail = url[:cut], url[cut:]

        if path_part.endswith(('.md', '.mdx')):
            fixed_url = path_part.rsplit('.', 1)[0] + tail
            issues.append((full_match, text, url, fixed_url))

    return issues
|
||
|
||
def fix_links(file_path: Path, issues: List[Tuple[str, str, str, str]], dry_run: bool = True) -> bool:
    """Rewrite the offending link targets inside one file.

    Args:
        file_path: File to patch.
        issues: ``(full_match, label, original_url, fixed_url)`` tuples
            describing the links to rewrite.
        dry_run: When True the file is read but never written.

    Returns:
        bool: True only if the file was actually rewritten on disk.
    """
    if not issues:
        return False

    # newline='' disables newline translation on read and write, so files
    # with CRLF line endings are not silently converted as a side effect.
    with open(file_path, 'r', encoding='utf-8', newline='') as f:
        content = f.read()

    modified_content = content

    for full_match, _text, old_url, new_url in issues:
        # Markdown matches always start with "[", while <a>/<Card> matches
        # start with "<". Testing the prefix (rather than searching for
        # "Card" or "<a" anywhere) keeps a Markdown link whose label happens
        # to contain those words from being treated as an HTML link.
        if full_match.startswith('['):
            old_pattern = f']({old_url})'
            new_pattern = f']({new_url})'
        else:
            old_pattern = f'href="{old_url}"'
            new_pattern = f'href="{new_url}"'
        modified_content = modified_content.replace(old_pattern, new_pattern)

    if dry_run or modified_content == content:
        return False

    with open(file_path, 'w', encoding='utf-8', newline='') as f:
        f.write(modified_content)
    return True
|
||
|
||
def process_file(file_path: Path, all_files: Dict[str, Path], base_dir: Path, args):
    """Scan one document for link problems and optionally fix them.

    Args:
        file_path: Document to inspect.
        all_files: Mapping of known document paths, forwarded to
            ``check_link_extensions``.
        base_dir: Documentation root, used to print a relative path.
        args: Parsed CLI options; reads ``auto_fix`` and ``dry_run``.
            ``auto_fix`` is set to True when the user answers ``a``.

    Returns:
        True when the file was rewritten, or, in dry-run mode, when it
        contains links that would be rewritten. The caller counts these
        files for its summary. False when nothing was found, the user
        skipped the file, or an error occurred.

    Note:
        Answering ``q`` at the prompt terminates the script via
        ``sys.exit(0)``. Any other exception is logged and swallowed so a
        single unreadable file does not abort the whole scan.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        links = extract_links(content)
        issues = check_link_extensions(links, file_path, all_files, base_dir)

        if not issues:
            return False

        rel_path = file_path.relative_to(base_dir)
        print(f"\n{Colors.HEADER}{Colors.BOLD}检查文件: {rel_path}{Colors.ENDC}")

        for i, (full_match, text, old_url, new_url) in enumerate(issues, 1):
            print(f" {i}. 发现问题: {Colors.WARNING}{old_url}{Colors.ENDC} -> {Colors.GREEN}{new_url}{Colors.ENDC}")

        # Ask the user whether to fix this file.
        if not args.auto_fix:
            raw_choice = input(f"\n{Colors.BOLD}修复这些问题? (y/n/a/q): {Colors.ENDC}")
            # Tolerate stray whitespace and upper-case answers.
            choice = raw_choice.strip().lower()
            if choice == 'q':  # q: quit the script
                log_info("用户请求退出脚本")
                sys.exit(0)
            elif choice == 'a':  # a: fix this and all remaining files
                args.auto_fix = True

            if choice not in ('y', 'a'):
                log_info(f"跳过修复 {rel_path}")
                return False

        fixed = fix_links(file_path, issues, dry_run=args.dry_run)

        if args.dry_run:
            log_info(f"已检测到 {len(issues)} 个需要修复的链接 (模拟运行,实际未修改)")
            # Nothing is written in dry-run mode, so ``fixed`` is always
            # False here; report the file as affected so the caller's
            # dry-run summary is not stuck at zero.
            return True

        if fixed:
            log_success(f"已修复 {len(issues)} 个链接问题")

        # In interactive mode, pause after each fixed file so the user can
        # review the result before moving on.
        if not args.auto_fix and fixed:
            input(f"\n{Colors.BOLD}已完成修复,按回车继续下一个文件...{Colors.ENDC}")

        return fixed

    except Exception as e:
        log_error(f"处理文件 {file_path} 时出错: {str(e)}")
        return False
|
||
|
||
def main():
    """Command-line entry point: scan a docs tree and fix link extensions.

    Parses the CLI options (``doc_path``, ``--dry-run``, ``--auto-fix``),
    asks for the root directory when it was not given, asks for
    confirmation, then runs ``process_file`` on every Markdown/MDX file
    found and prints a summary.

    Returns:
        int: 1 when the directory is invalid, otherwise 0 (including when
        the user cancels or interrupts the run).
    """
    parser = argparse.ArgumentParser(description='检查并修复文档中的链接问题')
    parser.add_argument('doc_path', nargs='?', help='文档根目录路径')
    parser.add_argument('--dry-run', action='store_true', help='只显示将要修改的内容,不实际修改文件')
    parser.add_argument('--auto-fix', action='store_true', help='自动修复所有问题,不询问')
    args = parser.parse_args()

    # Ask interactively when no path was given on the command line.
    if args.doc_path is None:
        doc_path = input(f"{Colors.BOLD}请输入文档根目录路径: {Colors.ENDC}")
        args.doc_path = doc_path.strip()

    base_dir = Path(args.doc_path)

    if not base_dir.exists() or not base_dir.is_dir():
        log_error(f"指定的目录 '{args.doc_path}' 不存在或不是一个目录")
        return 1

    # Show what is about to happen and ask for confirmation.
    print(f"\n{Colors.BOLD}将要扫描的目录:{Colors.ENDC} {Colors.GREEN}{base_dir}{Colors.ENDC}")
    if args.dry_run:
        print(f"{Colors.BOLD}模式:{Colors.ENDC} {Colors.BLUE}仅检查,不修改文件{Colors.ENDC}")
    elif args.auto_fix:
        print(f"{Colors.BOLD}模式:{Colors.ENDC} {Colors.BLUE}自动修复所有问题{Colors.ENDC}")
    else:
        print(f"{Colors.BOLD}模式:{Colors.ENDC} {Colors.BLUE}交互式修复{Colors.ENDC}")

    confirm = input(f"\n{Colors.BOLD}确认开始扫描? (y/n): {Colors.ENDC}")
    # Tolerate stray whitespace around the answer.
    if confirm.strip().lower() != 'y':
        log_info("操作已取消")
        return 0

    log_info(f"开始扫描目录: {base_dir}")

    # Locate every document file.
    all_files_list = find_all_md_files(base_dir)
    log_info(f"共找到 {len(all_files_list)} 个文档文件")

    # Map each path relative to the root onto its full path.
    all_files = {}
    for file_path in all_files_list:
        rel_path = file_path.relative_to(base_dir)
        all_files[str(rel_path)] = file_path

    fixed_count = 0
    processed_count = 0
    total_files = len(all_files_list)

    try:
        for i, file_path in enumerate(all_files_list, 1):
            # Blank the current terminal line, then redraw the progress text.
            sys.stdout.write("\r" + " " * 80)
            sys.stdout.write(f"\r{Colors.BOLD}进度: {i}/{total_files} ({i/total_files*100:.1f}%){Colors.ENDC}")
            sys.stdout.flush()

            if process_file(file_path, all_files, base_dir, args):
                fixed_count += 1
            processed_count = i
    except KeyboardInterrupt:
        print("\n")
        log_warning("用户中断了处理过程")
        # Fall through so the statistics gathered so far are still shown.

    print("\n")
    # Report the files actually handled; after Ctrl-C this is lower than
    # the number of files found.
    log_info(f"扫描完成,共处理 {processed_count} 个文件")

    if args.dry_run:
        log_info(f"发现 {fixed_count} 个文件中有链接问题需要修复")
    else:
        log_success(f"已修复 {fixed_count} 个文件中的链接问题")

    return 0
|
||
|
||
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())