mirror of
https://github.com/langgenius/dify-docs.git
synced 2026-03-26 13:18:34 +07:00
757 lines
27 KiB
Python
757 lines
27 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
多线程版GitBook链接检查器
|
||
|
||
此脚本使用多线程并行检查在线链接,大幅提高检查速度。
|
||
生成两个报告文件:
|
||
1. 包含所有链接的完整报告
|
||
2. 仅包含错误链接的报告
|
||
"""
|
||
|
||
import os
|
||
import re
|
||
import sys
|
||
import time
|
||
import threading
|
||
import queue
|
||
from concurrent.futures import ThreadPoolExecutor
|
||
from collections import defaultdict
|
||
from urllib.parse import urlparse
|
||
|
||
# Third-party dependency bootstrap: if `requests` is not installed,
# install it on the fly with pip (using the current interpreter) and
# retry the import.  NOTE(review): installing at import time mutates the
# environment as a side effect — confirm this is acceptable for callers.
try:
    import requests
    from requests.exceptions import RequestException
except ImportError:
    print("正在安装requests库...")
    import subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "requests"])
    import requests
    from requests.exceptions import RequestException
|
||
|
||
class LinkChecker:
    """Validate local and online links referenced from a GitBook
    SUMMARY.md and the markdown documents it points to."""

    def __init__(self, summary_path, base_dir=None, verify_online=True, max_threads=10):
        """
        Set up the checker's configuration and working state.

        Args:
            summary_path: Path to the SUMMARY.md file.
            base_dir: Documentation root; defaults to SUMMARY.md's directory.
            verify_online: Whether online links should be verified.
            max_threads: Upper bound on worker threads for online checks.
        """
        # Configuration.
        self.summary_path = os.path.abspath(summary_path)
        self.base_dir = base_dir or os.path.dirname(self.summary_path)
        self.verify_online = verify_online
        self.max_threads = max_threads

        # Parsing state.
        self.summary_links = []            # hierarchical links parsed from SUMMARY.md
        self.md_links = defaultdict(list)  # links found inside each document
        self.processed_files = set()       # documents already scanned
        self.summary_content = ""          # raw text of SUMMARY.md
        self.invalid_links = []            # every broken link discovered

        # Extensions treated as images (skipped during checking).
        self.image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.svg', '.bmp', '.tiff', '.webp')

        # Online-check result cache, shared across worker threads.
        self.online_link_cache = {}
        self.online_link_cache_lock = threading.Lock()  # guards the cache

        # URLs waiting for the threaded online check.
        self.online_links_queue = queue.Queue()

        # Progress counters for the online check.
        self.total_online_links = 0
        self.checked_online_links = 0
        self.progress_lock = threading.Lock()
|
||
|
||
def is_image_link(self, link):
|
||
"""
|
||
检查链接是否为图片链接
|
||
|
||
Args:
|
||
link: 链接路径
|
||
|
||
Returns:
|
||
is_image: 是否为图片链接
|
||
"""
|
||
return link.lower().endswith(self.image_extensions)
|
||
|
||
    def check_online_link(self, url):
        """
        Check whether an online URL is reachable.

        Results are memoized in ``self.online_link_cache`` (guarded by
        ``self.online_link_cache_lock``) so each URL is requested at most
        once per run.  NOTE(review): the cache check and the request are
        not atomic, so two threads may both fetch the same URL the first
        time — duplicated work only, the cached result stays consistent.

        Args:
            url: The http(s) URL to probe.

        Returns:
            is_valid: True if the URL answered with a status code < 400.
        """
        # Return the cached verdict if this URL was already checked.
        with self.online_link_cache_lock:
            if url in self.online_link_cache:
                return self.online_link_cache[url]

        if not self.verify_online:
            # Online verification disabled: treat the link as invalid.
            with self.online_link_cache_lock:
                self.online_link_cache[url] = False
            return False

        try:
            # Try a HEAD request first - cheaper than GET.
            response = requests.head(
                url,
                timeout=5,
                allow_redirects=True,
                headers={'User-Agent': 'Mozilla/5.0 GitBook-Link-Checker/1.0'}
            )

            if response.status_code < 400:
                # Any status below 400 counts as a live link.
                with self.online_link_cache_lock:
                    self.online_link_cache[url] = True
                return True

            # HEAD was rejected (some servers disallow it) - fall back to GET.
            response = requests.get(
                url,
                timeout=5,
                allow_redirects=True,
                headers={'User-Agent': 'Mozilla/5.0 GitBook-Link-Checker/1.0'}
            )

            result = response.status_code < 400
            with self.online_link_cache_lock:
                self.online_link_cache[url] = result
            return result

        except RequestException:
            # Network/protocol failure: the link is considered broken.
            with self.online_link_cache_lock:
                self.online_link_cache[url] = False
            return False
|
||
|
||
def resolve_path(self, link, current_dir):
|
||
"""
|
||
解析链接的实际路径
|
||
|
||
Args:
|
||
link: 链接路径
|
||
current_dir: 当前文件所在目录
|
||
|
||
Returns:
|
||
resolved_path: 解析后的路径
|
||
is_external: 是否为外部链接
|
||
is_valid: 链接是否有效
|
||
"""
|
||
if not link:
|
||
return None, False, False
|
||
|
||
# 处理锚点链接
|
||
if '#' in link:
|
||
link_part = link.split('#')[0]
|
||
if not link_part: # 如果只有锚点,没有路径部分
|
||
return None, False, True # 假设内部锚点是有效的
|
||
link = link_part
|
||
|
||
# 检查是否为图片链接
|
||
if self.is_image_link(link):
|
||
return None, False, True # 跳过图片链接,并假设它们是有效的
|
||
|
||
# 处理外部链接
|
||
if link.startswith(('http://', 'https://', 'mailto:', 'tel:')):
|
||
# 如果是http/https链接,加入待检查队列
|
||
if link.startswith(('http://', 'https://')) and self.verify_online:
|
||
# 将链接添加到待检查队列
|
||
self.online_links_queue.put(link)
|
||
with self.progress_lock:
|
||
self.total_online_links += 1
|
||
|
||
# 暂时返回未知状态,后续会更新
|
||
return link, True, None
|
||
elif link.startswith(('http://', 'https://')) and not self.verify_online:
|
||
# 如果不验证在线链接,标记为错误
|
||
return link, True, False
|
||
else:
|
||
# mailto和tel链接默认有效
|
||
return link, True, True
|
||
|
||
# 处理绝对路径 (从文档根目录开始)
|
||
if link.startswith('/'):
|
||
resolved_path = os.path.normpath(os.path.join(self.base_dir, link.lstrip('/')))
|
||
# 处理相对路径 (从当前文件所在目录开始)
|
||
else:
|
||
resolved_path = os.path.normpath(os.path.join(current_dir, link))
|
||
|
||
# 处理目录链接
|
||
if os.path.isdir(resolved_path):
|
||
readme_path = os.path.join(resolved_path, 'README.md')
|
||
if os.path.exists(readme_path):
|
||
return readme_path, False, True
|
||
index_path = os.path.join(resolved_path, 'index.md')
|
||
if os.path.exists(index_path):
|
||
return index_path, False, True
|
||
# 如果没有README.md或index.md,保持原样
|
||
return resolved_path, False, os.path.exists(resolved_path)
|
||
|
||
# 处理不带扩展名的文件引用
|
||
if not os.path.exists(resolved_path) and '.' not in os.path.basename(resolved_path):
|
||
md_path = f"{resolved_path}.md"
|
||
if os.path.exists(md_path):
|
||
return md_path, False, True
|
||
|
||
return resolved_path, False, os.path.exists(resolved_path)
|
||
|
||
def online_link_worker(self):
|
||
"""工作线程:处理在线链接检查"""
|
||
while True:
|
||
try:
|
||
# 从队列获取链接
|
||
url = self.online_links_queue.get(block=False)
|
||
|
||
# 检查链接
|
||
is_valid = self.check_online_link(url)
|
||
|
||
# 更新进度
|
||
with self.progress_lock:
|
||
self.checked_online_links += 1
|
||
checked = self.checked_online_links
|
||
total = self.total_online_links
|
||
|
||
# 显示进度
|
||
print(f"在线链接检查进度: [{checked}/{total}] - {url} - {'✅' if is_valid else '❌'}")
|
||
|
||
# 标记任务完成
|
||
self.online_links_queue.task_done()
|
||
except queue.Empty:
|
||
# 队列为空,退出线程
|
||
break
|
||
|
||
def extract_sections_from_summary(self):
|
||
"""
|
||
从SUMMARY.md提取所有章节信息
|
||
|
||
Returns:
|
||
sections: 章节列表
|
||
"""
|
||
print(f"从 {self.summary_path} 提取章节信息...")
|
||
|
||
try:
|
||
with open(self.summary_path, 'r', encoding='utf-8') as file:
|
||
self.summary_content = file.read()
|
||
except Exception as e:
|
||
print(f"读取文件时出错: {e}")
|
||
sys.exit(1)
|
||
|
||
# 提取所有章节标题
|
||
sections = []
|
||
section_pattern = r'^#+\s+(.*?)(?:\s+<a.*?>)?$'
|
||
|
||
for line in self.summary_content.split('\n'):
|
||
match = re.match(section_pattern, line)
|
||
if match:
|
||
section_title = match.group(1).strip()
|
||
sections.append(section_title)
|
||
|
||
return sections
|
||
|
||
def extract_links_from_summary(self):
|
||
"""
|
||
从SUMMARY.md提取所有链接及其层级结构
|
||
|
||
Returns:
|
||
links: 链接列表,每项包含链接信息和层级
|
||
"""
|
||
print(f"从 {self.summary_path} 提取链接...")
|
||
|
||
# 记录当前所在章节
|
||
current_section = ""
|
||
sections = self.extract_sections_from_summary()
|
||
|
||
# 按行处理SUMMARY文件
|
||
links = []
|
||
|
||
for line in self.summary_content.split('\n'):
|
||
# 检查是否是章节标题行
|
||
section_match = re.match(r'^#+\s+(.*?)(?:\s+<a.*?>)?$', line)
|
||
if section_match:
|
||
current_section = section_match.group(1).strip()
|
||
continue
|
||
|
||
# 检查缩进级别
|
||
indent_match = re.match(r'^(\s*)\*', line)
|
||
if not indent_match:
|
||
continue
|
||
|
||
indent = indent_match.group(1)
|
||
level = len(indent) // 2 # 假设每级缩进是2个空格
|
||
|
||
# 提取链接
|
||
link_match = re.search(r'\[([^\]]+)\]\(([^)]+)\)', line)
|
||
if not link_match:
|
||
continue
|
||
|
||
text, link = link_match.groups()
|
||
|
||
# 跳过只有锚点的链接
|
||
if link.startswith('#'):
|
||
continue
|
||
|
||
# 解析实际文件路径
|
||
file_path, is_external, is_valid = self.resolve_path(link, self.base_dir)
|
||
|
||
# 添加链接
|
||
link_info = {
|
||
'text': text,
|
||
'link': link,
|
||
'file_path': file_path,
|
||
'exists': is_valid,
|
||
'level': level,
|
||
'section': current_section,
|
||
'is_external': is_external,
|
||
'children': [], # 用于存储子链接
|
||
'source_file': 'SUMMARY.md'
|
||
}
|
||
|
||
links.append(link_info)
|
||
|
||
# 如果链接无效,添加到无效链接列表
|
||
if is_valid is False: # 注意:is_valid可能为None(在线链接待检查)
|
||
self.invalid_links.append(link_info)
|
||
|
||
# 构建层级结构
|
||
root_links = []
|
||
level_stack = [None] # 用于跟踪每个级别的最后一个链接
|
||
|
||
for link in links:
|
||
level = link['level']
|
||
|
||
# 调整栈以匹配当前级别
|
||
while len(level_stack) > level + 1:
|
||
level_stack.pop()
|
||
|
||
# 扩展栈以匹配当前级别
|
||
while len(level_stack) < level + 1:
|
||
level_stack.append(None)
|
||
|
||
if level == 0:
|
||
# 顶级链接
|
||
root_links.append(link)
|
||
else:
|
||
# 子链接,添加到父链接的children列表中
|
||
parent = level_stack[level - 1]
|
||
if parent:
|
||
parent['children'].append(link)
|
||
|
||
# 更新当前级别的最后一个链接
|
||
level_stack[level] = link
|
||
|
||
self.summary_links = root_links
|
||
return links
|
||
|
||
def extract_links_from_markdown(self, file_path):
|
||
"""
|
||
从Markdown文件中提取链接
|
||
|
||
Args:
|
||
file_path: Markdown文件路径
|
||
|
||
Returns:
|
||
links: 提取的链接列表
|
||
"""
|
||
if not file_path or file_path in self.processed_files:
|
||
return []
|
||
|
||
if not os.path.exists(file_path) or not file_path.endswith('.md'):
|
||
return []
|
||
|
||
self.processed_files.add(file_path)
|
||
|
||
try:
|
||
with open(file_path, 'r', encoding='utf-8') as file:
|
||
content = file.read()
|
||
except Exception as e:
|
||
print(f"读取文件 {file_path} 时出错: {e}")
|
||
return []
|
||
|
||
# 提取链接
|
||
link_pattern = r'\[([^\]]+)\]\(([^)]+)\)'
|
||
matches = re.findall(link_pattern, content)
|
||
|
||
links = []
|
||
current_dir = os.path.dirname(file_path)
|
||
relative_source_path = os.path.relpath(file_path, self.base_dir)
|
||
|
||
for text, link in matches:
|
||
# 检查是否为图片链接
|
||
if self.is_image_link(link):
|
||
continue
|
||
|
||
# 解析链接
|
||
resolved_path, is_external, is_valid = self.resolve_path(link, current_dir)
|
||
|
||
# 添加链接
|
||
link_info = {
|
||
'text': text,
|
||
'link': link,
|
||
'file_path': resolved_path,
|
||
'exists': is_valid,
|
||
'is_external': is_external,
|
||
'source_file': relative_source_path
|
||
}
|
||
|
||
links.append(link_info)
|
||
|
||
# 存储到字典中,以文件路径为键
|
||
if file_path not in self.md_links:
|
||
self.md_links[file_path] = []
|
||
self.md_links[file_path].append(link_info)
|
||
|
||
# 如果链接无效,添加到无效链接列表
|
||
if is_valid is False: # 注意:is_valid可能为None(在线链接待检查)
|
||
self.invalid_links.append(link_info)
|
||
|
||
return links
|
||
|
||
def check_links(self):
|
||
"""
|
||
递归检查所有链接
|
||
"""
|
||
# 提取SUMMARY中的链接
|
||
self.extract_links_from_summary()
|
||
|
||
# 递归处理每个链接
|
||
def process_link(link):
|
||
if not link.get('is_external') and link.get('exists') and link.get('file_path') and link.get('file_path').endswith('.md'):
|
||
try:
|
||
relative_path = os.path.relpath(link['file_path'], self.base_dir)
|
||
print(f"检查文件: {relative_path}")
|
||
self.extract_links_from_markdown(link['file_path'])
|
||
except Exception as e:
|
||
print(f"处理文件 {link.get('file_path')} 时出错: {e}")
|
||
|
||
# 递归处理子链接
|
||
for child in link.get('children', []):
|
||
process_link(child)
|
||
|
||
# 处理所有顶级链接
|
||
for link in self.summary_links:
|
||
process_link(link)
|
||
|
||
# 如果需要验证在线链接,启动多线程进行检查
|
||
if self.verify_online and self.total_online_links > 0:
|
||
self.check_online_links_with_threads()
|
||
|
||
# 更新链接状态
|
||
self.update_link_statuses()
|
||
|
||
def check_online_links_with_threads(self):
|
||
"""使用多线程检查在线链接"""
|
||
print(f"\n开始使用多线程检查在线链接,共有 {self.total_online_links} 个链接...")
|
||
|
||
# 创建线程池
|
||
num_threads = min(self.max_threads, self.total_online_links)
|
||
|
||
with ThreadPoolExecutor(max_workers=num_threads) as executor:
|
||
# 提交任务
|
||
futures = [executor.submit(self.online_link_worker) for _ in range(num_threads)]
|
||
|
||
# 等待队列任务完成
|
||
self.online_links_queue.join()
|
||
|
||
print(f"所有在线链接检查完成,共 {self.total_online_links} 个")
|
||
|
||
def update_link_statuses(self):
|
||
"""根据检查结果更新链接状态"""
|
||
# 更新所有链接的有效性状态
|
||
def update_link(link):
|
||
if link.get('is_external') and link.get('file_path') and link.get('file_path').startswith(('http://', 'https://')):
|
||
with self.online_link_cache_lock:
|
||
is_valid = self.online_link_cache.get(link['file_path'], False)
|
||
|
||
link['exists'] = is_valid
|
||
|
||
# 如果链接无效,添加到无效链接列表
|
||
if not is_valid and link not in self.invalid_links:
|
||
self.invalid_links.append(link)
|
||
|
||
# 递归处理子链接
|
||
for child in link.get('children', []):
|
||
update_link(child)
|
||
|
||
# 处理所有顶级链接
|
||
for link in self.summary_links:
|
||
update_link(link)
|
||
|
||
# 更新文档链接字典
|
||
for file_path, links in self.md_links.items():
|
||
for link in links:
|
||
if link.get('is_external') and link.get('file_path') and link.get('file_path').startswith(('http://', 'https://')):
|
||
with self.online_link_cache_lock:
|
||
is_valid = self.online_link_cache.get(link['file_path'], False)
|
||
|
||
link['exists'] = is_valid
|
||
|
||
# 如果链接无效,添加到无效链接列表
|
||
if not is_valid and link not in self.invalid_links:
|
||
self.invalid_links.append(link)
|
||
|
||
def generate_reports(self, output_path):
|
||
"""
|
||
生成两个报告:完整报告和错误链接报告
|
||
|
||
Args:
|
||
output_path: 完整报告输出文件路径
|
||
"""
|
||
# 生成完整报告
|
||
self.generate_full_report(output_path)
|
||
|
||
# 生成错误链接报告
|
||
error_report_path = output_path.replace('.md', '-error.md')
|
||
if output_path == error_report_path:
|
||
error_report_path = os.path.splitext(output_path)[0] + '-error.md'
|
||
|
||
self.generate_error_report(error_report_path)
|
||
|
||
def generate_full_report(self, output_path):
|
||
"""
|
||
生成包含所有链接的完整报告
|
||
|
||
Args:
|
||
output_path: 输出文件路径
|
||
"""
|
||
content = "# GitBook链接检查报告(完整版)\n\n"
|
||
|
||
# 添加章节标题说明
|
||
content += "本报告显示了GitBook文档中的所有链接及其引用的文档。每行的格式为:\n"
|
||
content += "* [文档标题](文档链接) | [引用的文档1](链接1) | [引用的文档2](链接2) | ...\n\n"
|
||
|
||
# 跟踪已处理的章节
|
||
processed_sections = set()
|
||
|
||
# 递归生成报告内容
|
||
def generate_link_report(link, indent=""):
|
||
nonlocal content
|
||
|
||
# 检查是否有新章节
|
||
if 'section' in link and link['section'] and link['section'] not in processed_sections:
|
||
content += f"\n## {link['section']}\n\n"
|
||
processed_sections.add(link['section'])
|
||
|
||
# 生成主链接
|
||
file_path = link.get('file_path')
|
||
status = "✅" if link.get('exists', False) else "❌"
|
||
|
||
# 基本链接信息
|
||
content += f"{indent}* [{link['text']}]({link['link']}) {status}"
|
||
|
||
# 添加该文档中引用的所有非图片链接
|
||
if file_path and file_path in self.md_links and self.md_links[file_path]:
|
||
referenced_links = self.md_links[file_path]
|
||
|
||
# 遍历文档中引用的所有链接
|
||
for ref_link in referenced_links:
|
||
# 跳过图片链接
|
||
if 'link' in ref_link and self.is_image_link(ref_link['link']):
|
||
continue
|
||
|
||
ref_status = "✅" if ref_link.get('exists', False) else "❌"
|
||
content += f" | [{ref_link['text']}]({ref_link['link']}) {ref_status}"
|
||
|
||
content += "\n"
|
||
|
||
# 递归处理子链接
|
||
for child in link.get('children', []):
|
||
generate_link_report(child, indent + " ")
|
||
|
||
# 处理所有顶级链接
|
||
for link in self.summary_links:
|
||
generate_link_report(link)
|
||
|
||
# 保存报告
|
||
try:
|
||
# 确保输出目录存在
|
||
output_dir = os.path.dirname(output_path)
|
||
if output_dir and not os.path.exists(output_dir):
|
||
os.makedirs(output_dir)
|
||
|
||
with open(output_path, 'w', encoding='utf-8') as file:
|
||
file.write(content)
|
||
|
||
print(f"完整报告已生成: {output_path}")
|
||
except Exception as e:
|
||
print(f"写入报告时出错: {e}")
|
||
|
||
def generate_error_report(self, output_path):
|
||
"""
|
||
生成仅包含错误链接的报告
|
||
|
||
Args:
|
||
output_path: 输出文件路径
|
||
"""
|
||
if not self.invalid_links:
|
||
print(f"没有发现无效链接,不生成错误报告")
|
||
return
|
||
|
||
content = "# GitBook链接检查报告(仅错误链接)\n\n"
|
||
content += "本报告仅显示文档中的无效链接。每行的格式为:\n"
|
||
content += "* [文档标题](文档链接) | [无效链接](链接路径) ❌\n\n"
|
||
|
||
# 按源文件组织无效链接
|
||
links_by_source = defaultdict(list)
|
||
|
||
for link in self.invalid_links:
|
||
source = link.get('source_file', 'Unknown')
|
||
links_by_source[source].append(link)
|
||
|
||
# 按源文件添加无效链接
|
||
for source, links in sorted(links_by_source.items()):
|
||
# 添加源文件标题
|
||
content += f"## 来自 {source}\n\n"
|
||
|
||
# 找到源文件在summary中的对应链接
|
||
summary_link = None
|
||
|
||
# 查找源文件对应的summary链接
|
||
for link in self.extract_links_from_summary():
|
||
if link.get('file_path') and os.path.relpath(link['file_path'], self.base_dir) == source:
|
||
summary_link = link
|
||
break
|
||
|
||
# 如果是SUMMARY.md本身
|
||
if source == 'SUMMARY.md':
|
||
# 添加每个无效链接
|
||
for link in links:
|
||
status = "❌"
|
||
content += f"* [{link['text']}]({link['link']}) {status}\n"
|
||
else:
|
||
# 如果找到了源文件对应的summary链接
|
||
if summary_link:
|
||
# 显示源文件链接和其中的无效链接
|
||
source_status = "✅" if summary_link.get('exists', False) else "❌"
|
||
content += f"* [{summary_link['text']}]({summary_link['link']}) {source_status}"
|
||
|
||
# 添加源文件中的无效链接
|
||
for link in links:
|
||
content += f" | [{link['text']}]({link['link']}) ❌"
|
||
|
||
content += "\n\n"
|
||
else:
|
||
# 没有找到源文件对应的summary链接,只显示无效链接
|
||
for link in links:
|
||
content += f"* 来自: {source} - [{link['text']}]({link['link']}) ❌\n"
|
||
|
||
content += "\n"
|
||
|
||
# 保存报告
|
||
try:
|
||
# 确保输出目录存在
|
||
output_dir = os.path.dirname(output_path)
|
||
if output_dir and not os.path.exists(output_dir):
|
||
os.makedirs(output_dir)
|
||
|
||
with open(output_path, 'w', encoding='utf-8') as file:
|
||
file.write(content)
|
||
|
||
print(f"错误报告已生成: {output_path}")
|
||
except Exception as e:
|
||
print(f"写入错误报告时出错: {e}")
|
||
|
||
|
||
def main():
    """Interactive entry point: gather paths and options (from argv or
    prompts), run the checker, write both reports, and print summary
    statistics."""
    print("=" * 60)
    print("多线程版GitBook链接检查器")
    print("=" * 60)

    # SUMMARY.md path: argv[1] or an interactive prompt (empty input
    # falls back to ./SUMMARY.md).
    if len(sys.argv) > 1:
        summary_path = sys.argv[1]
    else:
        summary_path = input("请输入SUMMARY.md文件路径: ").strip()
        if not summary_path:
            summary_path = os.path.join(os.getcwd(), "SUMMARY.md")
            print(f"使用默认路径: {summary_path}")

    # Bail out early when the file does not exist.
    if not os.path.isfile(summary_path):
        print(f"错误: 文件 '{summary_path}' 不存在")
        sys.exit(1)

    # Docs root: argv[2] or a prompt, defaulting to SUMMARY.md's directory.
    base_dir = os.path.dirname(os.path.abspath(summary_path))
    if len(sys.argv) > 2:
        base_dir = sys.argv[2]
    else:
        input_base_dir = input(f"请输入文档根目录 [默认: {base_dir}]: ").strip()
        if input_base_dir:
            base_dir = input_base_dir

    # Report output path: argv[3] or a prompt.
    if len(sys.argv) > 3:
        output_path = sys.argv[3]
    else:
        default_output = os.path.join(base_dir, "link-check-report.md")
        output_path = input(f"请输入输出文件路径 [默认: {default_output}]: ").strip()
        if not output_path:
            output_path = default_output

    # If a directory was given, place the report inside it.
    if os.path.isdir(output_path):
        output_path = os.path.join(output_path, "link-check-report.md")

    # Ask whether online links should actually be fetched.
    verify_online = input("是否验证在线链接? (y/n) [默认: n]: ").strip().lower() == 'y'

    max_threads = 10
    if verify_online:
        # Thread count for the online check (falls back to 10 on bad input).
        try:
            max_threads = int(input(f"请输入最大线程数 [默认: 10]: ").strip() or "10")
            if max_threads < 1:
                max_threads = 10
                print(f"线程数必须大于0,已设置为默认值10")
        except ValueError:
            max_threads = 10
            print(f"输入无效,已设置为默认值10")

        print(f"将使用 {max_threads} 个线程并行检查在线链接")
    else:
        print("未验证的在线链接将被标记为错误,并添加到错误报告中")

    start_time = time.time()

    try:
        # Build the checker and run the whole pipeline.
        checker = LinkChecker(
            summary_path=summary_path,
            base_dir=base_dir,
            verify_online=verify_online,
            max_threads=max_threads
        )

        checker.check_links()
        checker.generate_reports(output_path)

        # Summary statistics.
        total_files = len(checker.processed_files)
        invalid_links = len(checker.invalid_links)

        end_time = time.time()
        elapsed_time = end_time - start_time

        print(f"\n统计信息:")
        print(f"- 检查的文件数: {total_files}")
        print(f"- 无效链接数: {invalid_links}")
        print(f"- 耗时: {elapsed_time:.2f} 秒")

        print("\n检查完成!")
    except Exception as e:
        print(f"执行过程中出错: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
|
||
|
||
|
||
# Run only when executed as a script (not on import).
if __name__ == "__main__":
    main()