mirror of
https://github.com/langgenius/dify-docs.git
synced 2026-03-27 13:28:32 +07:00
343 lines
12 KiB
Python
343 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
import os
|
|
import re
|
|
import json
|
|
import sys
|
|
import datetime
|
|
from pathlib import Path
|
|
from typing import Dict, List, Set, Tuple, Optional
|
|
import logging
|
|
|
|
# 设置日志
|
|
logging.basicConfig(
|
|
level=logging.INFO,
|
|
format='%(asctime)s - %(levelname)s - %(message)s',
|
|
handlers=[
|
|
logging.FileHandler("link_validation.log"),
|
|
logging.StreamHandler()
|
|
]
|
|
)
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ANSI 颜色代码
|
|
class Colors:
|
|
RESET = '\033[0m'
|
|
RED = '\033[91m'
|
|
GREEN = '\033[92m'
|
|
YELLOW = '\033[93m'
|
|
BLUE = '\033[94m'
|
|
MAGENTA = '\033[95m'
|
|
CYAN = '\033[96m'
|
|
BOLD = '\033[1m'
|
|
UNDERLINE = '\033[4m'
|
|
|
|
def load_docs_json(docs_json_path: str) -> dict:
|
|
"""加载 docs.json 文件并解析其中的路径信息"""
|
|
try:
|
|
with open(docs_json_path, 'r', encoding='utf-8') as f:
|
|
docs_data = json.load(f)
|
|
return docs_data
|
|
except Exception as e:
|
|
logger.error(f"加载 docs.json 时出错: {e}")
|
|
raise
|
|
|
|
def extract_valid_paths(docs_data: dict) -> Set[str]:
|
|
"""从 docs.json 中提取所有有效的文档路径"""
|
|
valid_paths = set()
|
|
|
|
# 递归函数用于处理嵌套结构
|
|
def extract_paths_from_object(obj):
|
|
if isinstance(obj, dict):
|
|
# 检查是否有页面路径
|
|
if "pages" in obj:
|
|
for page in obj["pages"]:
|
|
if isinstance(page, str) and not page.startswith(("http://", "https://")):
|
|
valid_paths.add(page)
|
|
elif isinstance(page, dict):
|
|
extract_paths_from_object(page)
|
|
# 处理其他可能的字典键
|
|
for key, value in obj.items():
|
|
if key != "pages": # 避免重复处理
|
|
extract_paths_from_object(value)
|
|
elif isinstance(obj, list):
|
|
# 处理列表中的每个元素
|
|
for item in obj:
|
|
extract_paths_from_object(item)
|
|
|
|
# 开始提取
|
|
extract_paths_from_object(docs_data)
|
|
return valid_paths
|
|
|
|
def find_md_files(path: str) -> List[Path]:
|
|
"""查找指定路径下的所有 .md 和 .mdx 文件"""
|
|
path_obj = Path(path)
|
|
if path_obj.is_file() and (path_obj.suffix in ['.md', '.mdx']):
|
|
return [path_obj]
|
|
|
|
md_files = []
|
|
for root, _, files in os.walk(path):
|
|
for file in files:
|
|
if file.endswith(('.md', '.mdx')):
|
|
md_files.append(Path(os.path.join(root, file)))
|
|
|
|
return md_files
|
|
|
|
def detect_file_language(file_path: Path) -> str:
|
|
"""根据文件路径检测语言"""
|
|
path_str = str(file_path)
|
|
if "/en/" in path_str:
|
|
return "en"
|
|
elif "/zh-hans/" in path_str:
|
|
return "zh-hans"
|
|
elif "/ja-jp/" in path_str:
|
|
return "ja-jp"
|
|
else:
|
|
# 默认为英文
|
|
return "en"
|
|
|
|
def check_link_exists(link: str, base_path: str, valid_paths: Set[str]) -> bool:
|
|
"""检查链接是否存在"""
|
|
# 跳过锚点链接、邮件链接和外部链接
|
|
if link.startswith(('#', 'mailto:', 'http://', 'https://')):
|
|
return True
|
|
|
|
# 处理以 / 开头的路径 (绝对路径)
|
|
if link.startswith('/'):
|
|
# 移除开头的 /
|
|
clean_link = link[1:]
|
|
|
|
# 检查在 valid_paths 中是否存在
|
|
if clean_link in valid_paths:
|
|
return True
|
|
|
|
# 移除扩展名后检查
|
|
link_without_ext, _ = os.path.splitext(clean_link)
|
|
if link_without_ext in valid_paths:
|
|
return True
|
|
|
|
# 检查实际文件是否存在
|
|
full_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), clean_link)
|
|
if os.path.exists(full_path):
|
|
return True
|
|
|
|
# 尝试添加 .mdx 或 .md 扩展名
|
|
if os.path.exists(full_path + '.mdx') or os.path.exists(full_path + '.md'):
|
|
return True
|
|
|
|
return False
|
|
|
|
# 处理相对路径
|
|
base_dir = os.path.dirname(base_path)
|
|
target_path = os.path.normpath(os.path.join(base_dir, link))
|
|
|
|
# 检查文件是否存在
|
|
if os.path.exists(target_path):
|
|
return True
|
|
|
|
# 移除扩展名
|
|
target_without_ext, ext = os.path.splitext(target_path)
|
|
|
|
# 尝试添加 .mdx 或 .md 扩展名
|
|
if os.path.exists(target_without_ext + '.mdx') or os.path.exists(target_without_ext + '.md'):
|
|
return True
|
|
|
|
# 检查是否是目录
|
|
if os.path.isdir(target_without_ext):
|
|
# 检查目录中是否有 index.mdx 或 README.mdx
|
|
if os.path.exists(os.path.join(target_without_ext, 'index.mdx')) or \
|
|
os.path.exists(os.path.join(target_without_ext, 'index.md')) or \
|
|
os.path.exists(os.path.join(target_without_ext, 'README.mdx')) or \
|
|
os.path.exists(os.path.join(target_without_ext, 'README.md')):
|
|
return True
|
|
|
|
# 如果以上都不匹配,则检查在 valid_paths 中是否有相似路径
|
|
rel_path = os.path.relpath(target_path, os.path.dirname(os.path.abspath(__file__)))
|
|
rel_path = rel_path.replace('\\', '/') # 统一路径分隔符
|
|
|
|
if rel_path in valid_paths:
|
|
return True
|
|
|
|
# 移除扩展名再检查
|
|
rel_path_without_ext, _ = os.path.splitext(rel_path)
|
|
if rel_path_without_ext in valid_paths:
|
|
return True
|
|
|
|
return False
|
|
|
|
def find_line_number(content: str, link_text: str, link_url: str) -> int:
|
|
"""查找链接在文件中的行号"""
|
|
lines = content.split('\n')
|
|
# 先尝试精确匹配
|
|
full_link = f"[{link_text}]({link_url})"
|
|
|
|
for i, line in enumerate(lines, 1):
|
|
if full_link in line:
|
|
return i
|
|
|
|
# 如果精确匹配失败,尝试使用正则表达式进行模糊匹配
|
|
# 这可以处理链接包含特殊字符或包含属性的情况
|
|
try:
|
|
escaped_text = re.escape(link_text)
|
|
escaped_url = re.escape(link_url)
|
|
pattern = re.compile(f"\\[{escaped_text}\\]\\({escaped_url}")
|
|
|
|
for i, line in enumerate(lines, 1):
|
|
if pattern.search(line):
|
|
return i
|
|
except Exception:
|
|
pass # 如果正则表达式出错,忽略并继续
|
|
|
|
# 如果以上方法都失败,尝试只匹配 URL
|
|
try:
|
|
for i, line in enumerate(lines, 1):
|
|
if f"]({link_url})" in line or f"]({link_url} " in line:
|
|
return i
|
|
except Exception:
|
|
pass
|
|
|
|
return 0 # 如果找不到,返回 0
|
|
|
|
def validate_links_in_file(file_path: Path, valid_paths: Set[str], base_dir: str) -> List[Dict]:
|
|
"""验证文件中的链接"""
|
|
try:
|
|
with open(file_path, 'r', encoding='utf-8') as f:
|
|
content = f.read()
|
|
except Exception as e:
|
|
logger.error(f"读取文件 {file_path} 时出错: {e}")
|
|
return []
|
|
|
|
# 提取当前文件的相对路径(相对于基础目录)
|
|
relative_file_path = os.path.relpath(file_path, base_dir)
|
|
|
|
# 找出所有 Markdown 链接
|
|
link_pattern = re.compile(r'\[([^\]]+)\]\(([^)]+)(\s+"[^"]*")?\)')
|
|
matches = link_pattern.findall(content)
|
|
|
|
invalid_links = []
|
|
for match in matches:
|
|
link_text, link_url, _ = match
|
|
|
|
# 跳过锚点链接和邮件链接
|
|
if link_url.startswith(('#', 'mailto:')):
|
|
continue
|
|
|
|
# 跳过外部链接
|
|
if link_url.startswith(('http://', 'https://')):
|
|
continue
|
|
|
|
# 检查链接是否有效
|
|
if not check_link_exists(link_url, str(file_path), valid_paths):
|
|
line_number = find_line_number(content, link_text, link_url)
|
|
invalid_links.append({
|
|
"file": relative_file_path,
|
|
"link_text": link_text,
|
|
"link_url": link_url,
|
|
"line_number": line_number
|
|
})
|
|
|
|
return invalid_links
|
|
|
|
def generate_report(all_invalid_links: List[Dict], total_files_processed: int, base_dir: str) -> str:
|
|
"""生成报告并保存到文件"""
|
|
report_path = os.path.join(base_dir, "invalid_links_report.md")
|
|
|
|
with open(report_path, 'w', encoding='utf-8') as f:
|
|
f.write("# 无效链接报告\n\n")
|
|
f.write("生成时间: " + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "\n\n")
|
|
f.write("## 汇总\n\n")
|
|
f.write(f"* 处理文件数: {total_files_processed}\n")
|
|
f.write(f"* 发现无效链接数: {len(all_invalid_links)}\n\n")
|
|
|
|
# 按文件分组
|
|
files_groups = {}
|
|
for link in all_invalid_links:
|
|
if link['file'] not in files_groups:
|
|
files_groups[link['file']] = []
|
|
files_groups[link['file']].append(link)
|
|
|
|
f.write("## 详细报告\n\n")
|
|
|
|
for file, links in files_groups.items():
|
|
f.write(f"### 文件: {file}\n\n")
|
|
f.write("| 行号 | 链接文本 | 链接 URL |\n")
|
|
f.write("|------|----------|----------|\n")
|
|
for link in sorted(links, key=lambda x: x['line_number']):
|
|
# 处理可能包含特殊字符的文本
|
|
link_text = link['link_text'].replace('|', '\\|')
|
|
link_url = link['link_url'].replace('|', '\\|')
|
|
line_num = link['line_number'] if link['line_number'] > 0 else "未知"
|
|
f.write(f"| {line_num} | {link_text} | {link_url} |\n")
|
|
f.write("\n")
|
|
|
|
return report_path
|
|
|
|
def main():
|
|
"""主函数"""
|
|
base_dir = os.path.dirname(os.path.abspath(__file__))
|
|
docs_json_path = os.path.join(base_dir, 'docs.json')
|
|
|
|
# 显示脚本标题
|
|
print(f"\n{Colors.CYAN}{Colors.BOLD}===== Dify 文档链接验证工具 V2 ====={Colors.RESET}\n")
|
|
|
|
# 加载 docs.json
|
|
try:
|
|
print(f"{Colors.BLUE}正在加载 docs.json...{Colors.RESET}")
|
|
docs_data = load_docs_json(docs_json_path)
|
|
valid_paths = extract_valid_paths(docs_data)
|
|
print(f"{Colors.GREEN}从 docs.json 中提取了 {Colors.YELLOW}{len(valid_paths)}{Colors.GREEN} 个有效路径{Colors.RESET}")
|
|
except Exception as e:
|
|
print(f"{Colors.RED}加载 docs.json 时出错: {e}{Colors.RESET}")
|
|
sys.exit(1)
|
|
|
|
# 获取用户输入的路径
|
|
path_input = input(f"{Colors.CYAN}请输入要验证的文件或目录路径(相对于当前目录): {Colors.RESET}")
|
|
target_path = os.path.join(base_dir, path_input)
|
|
|
|
if not os.path.exists(target_path):
|
|
print(f"{Colors.RED}路径不存在: {target_path}{Colors.RESET}")
|
|
sys.exit(1)
|
|
|
|
# 查找 Markdown 文件
|
|
md_files = find_md_files(target_path)
|
|
print(f"{Colors.GREEN}找到 {Colors.YELLOW}{len(md_files)}{Colors.GREEN} 个 Markdown 文件{Colors.RESET}")
|
|
|
|
# 验证每个文件中的链接
|
|
all_invalid_links = []
|
|
total_files_processed = 0
|
|
|
|
for file_path in md_files:
|
|
total_files_processed += 1
|
|
print(f"{Colors.BLUE}正在验证文件 ({total_files_processed}/{len(md_files)}): {Colors.YELLOW}{file_path}{Colors.RESET}")
|
|
|
|
invalid_links = validate_links_in_file(file_path, valid_paths, base_dir)
|
|
if invalid_links:
|
|
print(f"{Colors.RED}发现 {len(invalid_links)} 个无效链接{Colors.RESET}")
|
|
for link in invalid_links:
|
|
line_num = link['line_number'] if link['line_number'] > 0 else "未知行"
|
|
print(f" {Colors.YELLOW}{line_num}: {Colors.RED}[{link['link_text']}]({link['link_url']}){Colors.RESET}")
|
|
all_invalid_links.extend(invalid_links)
|
|
else:
|
|
print(f"{Colors.GREEN}所有链接验证通过{Colors.RESET}")
|
|
|
|
# 显示总结
|
|
print(f"\n{Colors.CYAN}{Colors.BOLD}===== 验证完成 ====={Colors.RESET}")
|
|
print(f"{Colors.GREEN}共处理了 {Colors.YELLOW}{total_files_processed}{Colors.GREEN} 个文件{Colors.RESET}")
|
|
|
|
if all_invalid_links:
|
|
print(f"{Colors.RED}发现 {Colors.YELLOW}{len(all_invalid_links)}{Colors.RED} 个无效链接{Colors.RESET}")
|
|
|
|
# 询问是否保存报告
|
|
save_report = input(f"{Colors.CYAN}是否保存无效链接报告到文件? (y/n): {Colors.RESET}").lower() == 'y'
|
|
if save_report:
|
|
report_path = generate_report(all_invalid_links, total_files_processed, base_dir)
|
|
print(f"{Colors.GREEN}报告已保存到: {Colors.YELLOW}{report_path}{Colors.RESET}")
|
|
else:
|
|
print(f"{Colors.GREEN}恭喜!所有链接验证通过{Colors.RESET}")
|
|
|
|
if __name__ == "__main__":
|
|
try:
|
|
main()
|
|
except KeyboardInterrupt:
|
|
print(f"\n{Colors.RED}用户中断,程序退出{Colors.RESET}")
|
|
sys.exit(0)
|