mirror of
https://github.com/langgenius/dify-docs.git
synced 2026-03-26 13:18:34 +07:00
242 lines
11 KiB
Python
242 lines
11 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
Link Checker for Markdown/MDX files
|
||
|
||
This script checks both online links and relative file paths in markdown files.
|
||
It verifies that online links are accessible and that relative paths exist in the filesystem.
|
||
"""
|
||
|
||
import re
|
||
import requests
|
||
import os
|
||
from pathlib import Path
|
||
import concurrent.futures
|
||
import argparse
|
||
import sys
|
||
from colorama import init, Fore, Style
|
||
|
||
# Initialize colorama for cross-platform colored terminal output
|
||
init()
|
||
|
||
class LinkChecker:
|
||
def __init__(self, base_dir, timeout=10, max_workers=10):
|
||
self.base_dir = Path(base_dir)
|
||
self.timeout = timeout
|
||
self.max_workers = max_workers
|
||
self.results = {"valid": [], "invalid": [], "skipped": []}
|
||
self.headers = {
|
||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
|
||
}
|
||
|
||
def extract_links_from_markdown(self, file_path):
|
||
"""Extract all links from a Markdown/MDX file, with line and column info"""
|
||
with open(file_path, 'r', encoding='utf-8') as file:
|
||
content = file.read()
|
||
|
||
link_infos = []
|
||
# Markdown links [text](url)
|
||
for match in re.finditer(r'\[.*?\]\((.*?)\)', content):
|
||
url = match.group(1)
|
||
start = match.start(1)
|
||
line = content.count('\n', 0, start) + 1
|
||
col = start - content.rfind('\n', 0, start)
|
||
link_infos.append((url, line, col))
|
||
# HTML links <a href="url">
|
||
for match in re.finditer(r'<a\s+(?:[^>]*?\s+)?href="([^"]*)"', content):
|
||
url = match.group(1)
|
||
start = match.start(1)
|
||
line = content.count('\n', 0, start) + 1
|
||
col = start - content.rfind('\n', 0, start)
|
||
link_infos.append((url, line, col))
|
||
return link_infos
|
||
|
||
def is_external_link(self, url):
|
||
"""Check if a link is an external URL"""
|
||
return url.startswith(('http://', 'https://', 'ftp://'))
|
||
|
||
def is_anchor_link(self, url):
|
||
"""Check if a link is a page anchor"""
|
||
return url.startswith('#')
|
||
|
||
def is_mail_link(self, url):
|
||
"""Check if a link is a mailto link"""
|
||
return url.startswith('mailto:')
|
||
|
||
def check_online_link(self, link):
|
||
"""Check if an online link is accessible"""
|
||
try:
|
||
response = requests.head(link, allow_redirects=True, timeout=self.timeout, headers=self.headers)
|
||
|
||
# If HEAD request fails, try GET request
|
||
if response.status_code >= 400:
|
||
response = requests.get(link, timeout=self.timeout, headers=self.headers)
|
||
|
||
if response.status_code < 400:
|
||
return (link, True, f"HTTP {response.status_code}")
|
||
else:
|
||
return (link, False, f"HTTP {response.status_code}")
|
||
except requests.exceptions.Timeout:
|
||
return (link, False, "Timeout")
|
||
except requests.exceptions.ConnectionError:
|
||
return (link, False, "Connection Error")
|
||
except Exception as e:
|
||
return (link, False, str(e))
|
||
|
||
def check_local_path(self, link, file_path):
|
||
"""Check if a local file path exists"""
|
||
try:
|
||
# 如果有锚点,去掉锚点部分
|
||
if '#' in link:
|
||
link = link.split('#')[0]
|
||
current_file_dir = Path(file_path).parent
|
||
lang_dirs = ["zh-hans", "en", "ja-jp"]
|
||
# Handle different types of relative paths
|
||
if link.startswith('/'):
|
||
# 检查是否以 /zh-hans/、/en/、/ja-jp/ 开头
|
||
for lang in lang_dirs:
|
||
prefix = f'/{lang}/'
|
||
if link.startswith(prefix):
|
||
link = link[len(prefix):] # 去掉前导 /zh-hans/ 等
|
||
break
|
||
target_path = self.base_dir / link.lstrip('/')
|
||
else:
|
||
# Path relative to the current file
|
||
target_path = (current_file_dir / link).resolve()
|
||
# Handle paths without extensions (try adding .mdx or .md)
|
||
if not os.path.splitext(link)[1]:
|
||
# Path has no extension, try adding .mdx or .md
|
||
if target_path.with_suffix('.mdx').exists():
|
||
return (link, True, str(target_path.with_suffix('.mdx')))
|
||
elif target_path.with_suffix('.md').exists():
|
||
return (link, True, str(target_path.with_suffix('.md')))
|
||
else:
|
||
# Check if the directory exists
|
||
if target_path.exists() and target_path.is_dir():
|
||
# Check for index.mdx or index.md in the directory
|
||
if (target_path / 'index.mdx').exists():
|
||
return (link, True, str(target_path / 'index.mdx'))
|
||
elif (target_path / 'index.md').exists():
|
||
return (link, True, str(target_path / 'index.md'))
|
||
# Check if the path exists directly
|
||
if target_path.exists():
|
||
return (link, True, str(target_path))
|
||
else:
|
||
return (link, False, f"File not found: {target_path}")
|
||
except Exception as e:
|
||
return (link, False, str(e))
|
||
|
||
def check_link(self, link, file_path):
|
||
"""Check a link based on its type"""
|
||
if self.is_external_link(link):
|
||
return self.check_online_link(link)
|
||
elif self.is_anchor_link(link) or self.is_mail_link(link):
|
||
return (link, None, "Skipped (anchor or mail link)")
|
||
else:
|
||
return self.check_local_path(link, file_path)
|
||
|
||
def check_links_in_file(self, file_path):
|
||
"""Check all links in a file, with line/col info"""
|
||
link_infos = self.extract_links_from_markdown(file_path)
|
||
print(f"Found {len(link_infos)} links in {file_path}")
|
||
with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
|
||
future_to_info = {executor.submit(self.check_link, url, file_path): (url, line, col) for url, line, col in link_infos}
|
||
for future in concurrent.futures.as_completed(future_to_info):
|
||
url, line, col = future_to_info[future]
|
||
link, is_valid, status = future.result()
|
||
if is_valid is None:
|
||
self.results["skipped"].append((link, status, file_path, line, col))
|
||
elif is_valid:
|
||
self.results["valid"].append((link, status, file_path, line, col))
|
||
else:
|
||
self.results["invalid"].append((link, status, file_path, line, col))
|
||
|
||
def print_report(self):
|
||
"""Print a colored report of the link check results, with file/line info for invalid links"""
|
||
print("\n" + "="*60)
|
||
print(f"{Fore.CYAN}LINK CHECKER REPORT{Style.RESET_ALL}")
|
||
print("="*60)
|
||
print(f"\n{Fore.GREEN}✅ Valid Links ({len(self.results['valid'])}):{Style.RESET_ALL}")
|
||
for link, status, file_path, line, col in self.results["valid"]:
|
||
print(f" - {link} -> {status} ({file_path}:{line}:{col})")
|
||
print(f"\n{Fore.RED}❌ Invalid Links ({len(self.results['invalid'])}):{Style.RESET_ALL}")
|
||
for link, status, file_path, line, col in self.results["invalid"]:
|
||
# Cursor/VSCode 终端友好格式
|
||
print(f"{file_path}:{line}:{col}: {Fore.RED}无效链接: {link} -> {status}{Style.RESET_ALL}")
|
||
print(f"\n{Fore.YELLOW}⏩ Skipped Links ({len(self.results['skipped'])}):{Style.RESET_ALL}")
|
||
for link, reason, file_path, line, col in self.results["skipped"]:
|
||
print(f" - {link} ({reason}) ({file_path}:{line}:{col})")
|
||
print("\n" + "-"*60)
|
||
print(f"{Fore.CYAN}SUMMARY:{Style.RESET_ALL}")
|
||
print(f"Total links: {len(self.results['valid']) + len(self.results['invalid']) + len(self.results['skipped'])}")
|
||
print(f"{Fore.GREEN}Valid: {len(self.results['valid'])}{Style.RESET_ALL}")
|
||
print(f"{Fore.RED}Invalid: {len(self.results['invalid'])}{Style.RESET_ALL}")
|
||
print(f"{Fore.YELLOW}Skipped: {len(self.results['skipped'])}{Style.RESET_ALL}")
|
||
print("-"*60)
|
||
|
||
def check_links_in_directory(self, directory, file_pattern="*.md*"):
|
||
"""Check links in all markdown files in a directory (md/mdx)"""
|
||
mdx_files = list(Path(directory).glob(f"**/{file_pattern}"))
|
||
print(f"Found {len(mdx_files)} {file_pattern} files in {directory}")
|
||
for file_path in mdx_files:
|
||
print(f"\nChecking {file_path}...")
|
||
self.check_links_in_file(file_path)
|
||
|
||
def main():
|
||
parser = argparse.ArgumentParser(description='Check links in markdown files')
|
||
parser.add_argument('path', nargs='?', default=None, help='Path to the markdown file or directory to check')
|
||
parser.add_argument('--base-dir', help='Base directory for resolving relative paths (default: parent dir of the file)')
|
||
parser.add_argument('--timeout', type=int, default=10, help='Timeout for HTTP requests in seconds (default: 10)')
|
||
parser.add_argument('--workers', type=int, default=10, help='Number of worker threads (default: 10)')
|
||
parser.add_argument('--pattern', default="*.mdx", help='File pattern to match when checking directories (default: *.mdx)')
|
||
|
||
args = parser.parse_args()
|
||
|
||
# 新增:如果没有传入 path,则提示输入
|
||
if not args.path:
|
||
args.path = input("请输入要检查的文件或目录路径: ").strip()
|
||
|
||
file_path = Path(args.path)
|
||
if not file_path.exists():
|
||
print(f"{Fore.RED}Error: Path '{args.path}' does not exist{Style.RESET_ALL}")
|
||
return 1
|
||
|
||
# If base_dir is not specified, use parent of parent directory (for typical docs structure)
|
||
if args.base_dir:
|
||
base_dir = args.base_dir
|
||
else:
|
||
# 智能向上查找最近的语言目录(zh-hans、en、ja-jp)
|
||
lang_dirs = ["zh-hans", "en", "ja-jp"]
|
||
current = file_path.resolve()
|
||
found = False
|
||
for parent in [current] + list(current.parents):
|
||
if parent.name in lang_dirs:
|
||
base_dir = str(parent)
|
||
found = True
|
||
break
|
||
if not found:
|
||
base_dir = file_path.parent
|
||
|
||
checker = LinkChecker(
|
||
base_dir=base_dir,
|
||
timeout=args.timeout,
|
||
max_workers=args.workers
|
||
)
|
||
|
||
print(f"Base directory: {base_dir}")
|
||
|
||
if file_path.is_file():
|
||
print(f"Checking links in file: {file_path}")
|
||
checker.check_links_in_file(file_path)
|
||
else:
|
||
print(f"Checking links in directory: {file_path}")
|
||
checker.check_links_in_directory(file_path, args.pattern)
|
||
|
||
checker.print_report()
|
||
|
||
# Return non-zero exit code if invalid links were found
|
||
return 1 if checker.results["invalid"] else 0
|
||
|
||
if __name__ == "__main__":
|
||
sys.exit(main())
|