dify-docs/scripts/auto-url-check.py

#!/usr/bin/env python3
"""
多线程版GitBook链接检查器

此脚本使用多线程并行检查在线链接，大幅提高检查速度。
生成两个报告文件：
1. 包含所有链接的完整报告
2. 仅包含错误链接的报告
"""

import os
import re
import sys
import time
import threading
import queue
from concurrent.futures import ThreadPoolExecutor
from collections import defaultdict
from urllib.parse import urlparse

try:
    import requests
    from requests.exceptions import RequestException
except ImportError:
    print("正在安装requests库...")
    import subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "requests"])
    import requests
    from requests.exceptions import RequestException

class LinkChecker:
    def __init__(self, summary_path, base_dir=None, verify_online=True, max_threads=10):
        """
        Set up a link checker for one GitBook-style documentation tree.

        Args:
            summary_path: Path to the SUMMARY.md file; stored as an absolute path.
            base_dir: Documentation root. When omitted (or empty) the directory
                containing SUMMARY.md is used.
            verify_online: Whether http/https links are actually requested.
            max_threads: Upper bound on worker threads for online checks.
        """
        absolute_summary = os.path.abspath(summary_path)
        self.summary_path = absolute_summary
        self.base_dir = base_dir if base_dir else os.path.dirname(absolute_summary)
        self.verify_online = verify_online
        self.max_threads = max_threads

        # Scan results.
        self.summary_content = ""  # raw text of SUMMARY.md
        self.summary_links = []  # top-level SUMMARY.md links (children nested inside)
        self.md_links = defaultdict(list)  # document path -> links found in that document
        self.processed_files = set()  # documents that were already scanned
        self.invalid_links = []  # every link judged invalid

        # Extensions that mark a link target as an image.
        self.image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.svg', '.bmp', '.tiff', '.webp')

        # Online checking: URLs waiting for a worker, plus a lock-guarded
        # result cache so that a URL is not requested again once it has a verdict.
        self.online_links_queue = queue.Queue()
        self.online_link_cache = {}
        self.online_link_cache_lock = threading.Lock()

        # Progress counters, updated under progress_lock.
        self.progress_lock = threading.Lock()
        self.checked_online_links = 0
        self.total_online_links = 0

    def is_image_link(self, link):
        """
        Tell whether a link target points at an image file.

        The comparison is case-insensitive and uses ``self.image_extensions``.
        Besides the plain "ends with an image extension" check, the target is
        also examined after removing the decorations Markdown and URLs allow
        around a path: surrounding whitespace, a ``<...>`` wrapper, a trailing
        title (``path.png "title"``), a query string and a fragment.

        Args:
            link: Link target as written in the document.

        Returns:
            True if the target is an image, otherwise False (also for an
            empty or missing target).
        """
        if not link:
            return False

        lowered = link.strip().lower()
        # Anything the plain suffix test accepts stays accepted.
        if lowered.endswith(self.image_extensions):
            return True
        if not lowered:
            return False

        if lowered.startswith('<') and '>' in lowered:
            # <path with spaces.png> - the path is whatever sits inside the brackets.
            target = lowered[1:lowered.index('>')]
        else:
            # path.png "title" - the path is the first whitespace-separated token.
            target = lowered.split(None, 1)[0]

        # Ignore ?query and #fragment so "logo.png?raw=true" is recognised.
        path = target.split('#', 1)[0].split('?', 1)[0]
        return path.endswith(self.image_extensions)

    def check_online_link(self, url):
        """
        Check whether an online URL answers with a non-error status.

        A cached verdict is returned when one exists. Otherwise, if online
        verification is disabled, the URL is recorded as invalid. If it is
        enabled, a HEAD request is tried first and a GET request is used as
        fallback whenever HEAD either returns a status >= 400 or fails with a
        request exception (some servers reject HEAD outright). The GET is
        streamed so the body is not downloaded, and every response is closed.

        Args:
            url: The http/https URL to check.

        Returns:
            True if a request returned a status code below 400, else False.
            The result is stored in ``self.online_link_cache``.
        """
        with self.online_link_cache_lock:
            if url in self.online_link_cache:
                return self.online_link_cache[url]

        is_valid = False
        if self.verify_online:
            request_kwargs = {
                'timeout': 5,
                'allow_redirects': True,
                'headers': {'User-Agent': 'Mozilla/5.0 GitBook-Link-Checker/1.0'},
            }
            # Cheap request first; GET only when HEAD did not prove the link valid.
            attempts = (
                (requests.head, {}),
                (requests.get, {'stream': True}),
            )
            for send, extra_kwargs in attempts:
                try:
                    response = send(url, **request_kwargs, **extra_kwargs)
                except RequestException:
                    # This method failed at the transport level; try the next one.
                    continue
                try:
                    is_valid = response.status_code < 400
                finally:
                    # Release the connection; for the streamed GET this also
                    # discards the unread body.
                    response.close()
                if is_valid:
                    break

        with self.online_link_cache_lock:
            self.online_link_cache[url] = is_valid
        return is_valid

    def resolve_path(self, link, current_dir):
        """
        Resolve a link target to the thing it points at.

        Before resolving, the target is normalised the way Markdown allows it
        to be written: surrounding whitespace is removed, a ``<...>`` wrapper
        is unwrapped and a trailing title (``path.md "title"``) is dropped.
        Local paths that do not exist as written are tried once more after
        percent-decoding (``my%20doc.md`` -> ``my doc.md``).

        Args:
            link: Link target as written in the document.
            current_dir: Directory of the file that contains the link; used
                for relative targets.

        Returns:
            A tuple ``(resolved_path, is_external, is_valid)``:
            resolved_path is the local path or the URL, or None for empty
            targets, pure anchors and images; is_external is True for
            http/https/mailto/tel targets; is_valid is True/False, or None
            for an http/https link that was queued for a later online check.
        """
        # Function-scope import: the file only imports `urlparse` from urllib.parse.
        from urllib.parse import unquote

        if not link:
            return None, False, False

        link = link.strip()
        if link.startswith('<') and '>' in link:
            link = link[1:link.index('>')].strip()
        else:
            # [text](target "title") - keep only the target.
            title_match = re.match(r'^(\S+)\s+["\']', link)
            if title_match:
                link = title_match.group(1)
        if not link:
            return None, False, False

        # Drop the anchor; a target that is only an anchor is assumed valid.
        if '#' in link:
            link = link.split('#', 1)[0]
            if not link:
                return None, False, True

        # Images are skipped and assumed valid.
        if self.is_image_link(link):
            return None, False, True

        if link.startswith(('http://', 'https://')):
            if not self.verify_online:
                # Unverified online links are reported as errors by design.
                return link, True, False
            self.online_links_queue.put(link)
            with self.progress_lock:
                self.total_online_links += 1
            # Verdict unknown until the worker threads have run.
            return link, True, None

        if link.startswith(('mailto:', 'tel:')):
            return link, True, True

        def locate(target):
            """Map a local target to (path, exists), applying directory and .md fallbacks."""
            if target.startswith('/'):
                # Absolute targets are relative to the documentation root.
                candidate = os.path.normpath(os.path.join(self.base_dir, target.lstrip('/')))
            else:
                candidate = os.path.normpath(os.path.join(current_dir, target))

            if os.path.isdir(candidate):
                for index_name in ('README.md', 'index.md'):
                    index_path = os.path.join(candidate, index_name)
                    if os.path.exists(index_path):
                        return index_path, True
                # A directory without an index page still exists.
                return candidate, True

            # Extension-less reference: try the matching .md file.
            if not os.path.exists(candidate) and '.' not in os.path.basename(candidate):
                md_candidate = f"{candidate}.md"
                if os.path.exists(md_candidate):
                    return md_candidate, True

            return candidate, os.path.exists(candidate)

        resolved_path, exists = locate(link)
        if not exists:
            decoded = unquote(link)
            if decoded != link:
                decoded_path, decoded_exists = locate(decoded)
                if decoded_exists:
                    return decoded_path, False, True

        return resolved_path, False, exists

    def online_link_worker(self):
        """
        Worker loop: take URLs from the queue and check them until it is empty.

        Every URL taken from ``self.online_links_queue`` is acknowledged with
        ``task_done()`` even when checking it raises, so a ``queue.join()``
        elsewhere cannot block forever on an item that was never acknowledged.
        A URL whose check raises is counted and reported as invalid.
        """
        while True:
            try:
                url = self.online_links_queue.get(block=False)
            except queue.Empty:
                # Nothing left to do; let the thread finish.
                break

            try:
                try:
                    is_valid = self.check_online_link(url)
                except Exception as error:
                    # One malformed URL must not kill the worker.
                    is_valid = False
                    print(f"检查链接 {url} 时出错: {error}")

                with self.progress_lock:
                    self.checked_online_links += 1
                    checked = self.checked_online_links
                    total = self.total_online_links

                print(f"在线链接检查进度: [{checked}/{total}] - {url} - {'✅' if is_valid else '❌'}")
            finally:
                self.online_links_queue.task_done()

    def extract_sections_from_summary(self):
        """
        Read SUMMARY.md and collect its heading titles.

        The file text is stored in ``self.summary_content``. If the file
        cannot be read, an error is printed and the process exits with
        status 1.

        Returns:
            sections: Heading titles in file order, without the leading
                ``#`` marks and without a trailing ``<a ...>`` tag.
        """
        print(f"从 {self.summary_path} 提取章节信息...")

        try:
            with open(self.summary_path, 'r', encoding='utf-8') as handle:
                self.summary_content = handle.read()
        except Exception as e:
            print(f"读取文件时出错: {e}")
            sys.exit(1)

        # A heading is one or more '#', whitespace, then the title.
        heading = re.compile(r'^#+\s+(.*?)(?:\s+<a.*?>)?$')
        hits = (heading.match(row) for row in self.summary_content.split('\n'))
        return [hit.group(1).strip() for hit in hits if hit]

    def extract_links_from_summary(self):
        """
        Extract every list-item link of SUMMARY.md and build the link tree.

        Each bullet line (``*``, ``-`` or ``+``) that contains a Markdown link
        becomes one entry; links that are only an anchor are skipped. The
        nesting level is the indentation divided by two (a tab counts as one
        level). An entry is attached to the nearest preceding entry with a
        smaller level, so documents indented by more than two spaces per
        level are kept in the tree; entries without any ancestor become
        top-level entries.

        Side effects: re-reads SUMMARY.md, replaces ``self.summary_links``
        with the top-level entries, appends invalid entries to
        ``self.invalid_links`` and, through ``resolve_path``, may queue
        online links. Calling it more than once repeats those effects.

        Returns:
            links: Flat list of all entries in document order.
        """
        print(f"从 {self.summary_path} 提取链接...")

        # Called for its side effect of loading self.summary_content;
        # the returned section titles are not needed here.
        self.extract_sections_from_summary()

        heading_pattern = re.compile(r'^#+\s+(.*?)(?:\s+<a.*?>)?$')
        bullet_pattern = re.compile(r'^(\s*)[*+-]')
        link_pattern = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')

        current_section = ""
        links = []

        for line in self.summary_content.split('\n'):
            heading = heading_pattern.match(line)
            if heading:
                current_section = heading.group(1).strip()
                continue

            bullet = bullet_pattern.match(line)
            if not bullet:
                continue

            link_match = link_pattern.search(line)
            if not link_match:
                continue

            text, link = link_match.groups()
            if link.startswith('#'):
                continue

            # Two columns of indentation per level; a tab counts as one level.
            indent = bullet.group(1).replace('\t', '  ')
            level = len(indent) // 2

            file_path, is_external, is_valid = self.resolve_path(link, self.base_dir)

            link_info = {
                'text': text,
                'link': link,
                'file_path': file_path,
                'exists': is_valid,
                'level': level,
                'section': current_section,
                'is_external': is_external,
                'children': [],
                'source_file': 'SUMMARY.md'
            }
            links.append(link_info)

            # is_valid may be None for online links that are still pending.
            if is_valid is False:
                self.invalid_links.append(link_info)

        # level_stack[i] is the most recent entry seen at level i (or None).
        root_links = []
        level_stack = []

        for link in links:
            level = link['level']

            # Forget entries deeper than the current one, pad missing levels.
            del level_stack[level + 1:]
            while len(level_stack) <= level:
                level_stack.append(None)

            # Nearest shallower entry; skipped levels hold None.
            parent = next(
                (ancestor for ancestor in reversed(level_stack[:level]) if ancestor is not None),
                None,
            )
            if parent is None:
                root_links.append(link)
            else:
                parent['children'].append(link)

            level_stack[level] = link

        self.summary_links = root_links
        return links

    def extract_links_from_markdown(self, file_path):
        """
        Collect the non-image links of one Markdown document.

        A document is scanned at most once; empty paths, paths that do not
        end in ``.md`` and missing files yield an empty list. Every link
        found is recorded in ``self.md_links[file_path]`` and, when it is
        known to be invalid, in ``self.invalid_links``.

        Args:
            file_path: Path of the Markdown document.

        Returns:
            links: One dict per link with the keys text, link, file_path,
                exists, is_external and source_file.
        """
        if not file_path or file_path in self.processed_files:
            return []
        if not (file_path.endswith('.md') and os.path.exists(file_path)):
            return []

        self.processed_files.add(file_path)

        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                document_text = handle.read()
        except Exception as e:
            print(f"读取文件 {file_path} 时出错: {e}")
            return []

        source_dir = os.path.dirname(file_path)
        source_label = os.path.relpath(file_path, self.base_dir)

        collected = []
        for found in re.finditer(r'\[([^\]]+)\]\(([^)]+)\)', document_text):
            label, target = found.groups()

            # Image targets are not reported.
            if self.is_image_link(target):
                continue

            resolved_path, is_external, is_valid = self.resolve_path(target, source_dir)
            entry = {
                'text': label,
                'link': target,
                'file_path': resolved_path,
                'exists': is_valid,
                'is_external': is_external,
                'source_file': source_label
            }

            collected.append(entry)
            self.md_links.setdefault(file_path, []).append(entry)

            # None means "online link, verdict pending" and is not an error yet.
            if is_valid is False:
                self.invalid_links.append(entry)

        return collected

    def check_links(self):
        """
        Run the whole check: SUMMARY.md first, then every document it lists.

        The SUMMARY.md tree is walked depth-first in document order. Each
        existing local ``.md`` entry is scanned for links. When online
        verification is enabled and links were queued, they are checked by
        the worker threads and the stored link states are refreshed.
        """
        self.extract_links_from_summary()

        # Explicit stack; children are pushed reversed so they pop in order.
        pending = list(reversed(self.summary_links))
        while pending:
            entry = pending.pop()
            target = entry.get('file_path')

            is_local_document = (
                not entry.get('is_external')
                and entry.get('exists')
                and target
                and target.endswith('.md')
            )
            if is_local_document:
                try:
                    relative_path = os.path.relpath(entry['file_path'], self.base_dir)
                    print(f"检查文件: {relative_path}")
                    self.extract_links_from_markdown(entry['file_path'])
                except Exception as e:
                    print(f"处理文件 {entry.get('file_path')} 时出错: {e}")

            pending.extend(reversed(entry.get('children', [])))

        if self.verify_online and self.total_online_links > 0:
            self.check_online_links_with_threads()
            # Replace the pending (None) states with the real verdicts.
            self.update_link_statuses()

    def check_online_links_with_threads(self):
        """
        Check all queued online links with a pool of worker threads.

        Does nothing when no links were queued. The pool size is the smaller
        of ``self.max_threads`` and the number of queued links, but never
        less than one (``ThreadPoolExecutor`` rejects a size of zero).
        Completion is detected by waiting for the workers themselves, which
        return once the queue is empty; an exception that escapes a worker
        is printed instead of being lost.
        """
        pending = self.total_online_links
        if pending <= 0:
            return

        print(f"\n开始使用多线程检查在线链接，共有 {self.total_online_links} 个链接...")

        num_threads = max(1, min(self.max_threads, pending))

        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            futures = [executor.submit(self.online_link_worker) for _ in range(num_threads)]

            # All links are queued before the workers start, so "every worker
            # returned" means "queue drained" - without the risk of blocking
            # forever on an item that was never acknowledged.
            for future in futures:
                try:
                    future.result()
                except Exception as error:
                    print(f"在线链接检查线程出错: {error}")

        print(f"所有在线链接检查完成，共 {self.total_online_links} 个")

    def update_link_statuses(self):
        """
        Copy the online check results into every stored http/https link.

        The SUMMARY.md tree is visited first (depth-first, document order),
        then the links recorded per document. A URL without a cached verdict
        counts as invalid. Invalid links are appended to
        ``self.invalid_links`` unless an equal entry is already present.
        """
        def refresh(entry):
            """Apply the cached verdict to one link if it is an http/https link."""
            target = entry.get('file_path')
            if not (entry.get('is_external') and target and target.startswith(('http://', 'https://'))):
                return

            with self.online_link_cache_lock:
                verdict = self.online_link_cache.get(entry['file_path'], False)

            entry['exists'] = verdict
            if not verdict and entry not in self.invalid_links:
                self.invalid_links.append(entry)

        # SUMMARY.md tree; children are pushed reversed so they pop in order.
        stack = list(reversed(self.summary_links))
        while stack:
            node = stack.pop()
            refresh(node)
            stack.extend(reversed(node.get('children', [])))

        # Links found inside the individual documents.
        for entries in self.md_links.values():
            for entry in entries:
                refresh(entry)

    def generate_reports(self, output_path):
        """
        Write both reports: the full report and the errors-only report.

        The error report is written next to the full report. Its name is the
        full report's name with the extension replaced by ``-error.md``
        (``report.md`` -> ``report-error.md``). Only the final extension is
        touched, so a ``.md`` occurring elsewhere in the path, for example in
        a directory name, is left alone.

        Args:
            output_path: Path of the full report.
        """
        self.generate_full_report(output_path)

        error_report_path = os.path.splitext(output_path)[0] + '-error.md'
        self.generate_error_report(error_report_path)

    def generate_full_report(self, output_path):
        """
        Write the report that lists every SUMMARY.md link.

        Each SUMMARY.md entry becomes one bullet line, indented by nesting
        depth, followed by the non-image links found inside the referenced
        document. A section heading is emitted the first time an entry of
        that section appears. Failures while writing are printed, not raised.

        Args:
            output_path: Path of the report file; missing parent directories
                are created.
        """
        pieces = [
            "# GitBook链接检查报告（完整版）\n\n",
            "本报告显示了GitBook文档中的所有链接及其引用的文档。每行的格式为：\n",
            "* [文档标题](文档链接) | [引用的文档1](链接1) | [引用的文档2](链接2) | ...\n\n",
        ]
        seen_sections = set()

        def mark(ok):
            """Status icon for a link."""
            return "✅" if ok else "❌"

        def emit(entry, pad):
            """Append the line for one entry, then the lines of its children."""
            section = entry.get('section')
            if section and section not in seen_sections:
                pieces.append(f"\n## {section}\n\n")
                seen_sections.add(section)

            row = [f"{pad}* [{entry['text']}]({entry['link']}) {mark(entry.get('exists', False))}"]

            # Links found inside the referenced document, images left out.
            doc_path = entry.get('file_path')
            references = self.md_links.get(doc_path) if doc_path else None
            for ref in references or ():
                if 'link' in ref and self.is_image_link(ref['link']):
                    continue
                row.append(f" | [{ref['text']}]({ref['link']}) {mark(ref.get('exists', False))}")

            row.append("\n")
            pieces.append("".join(row))

            for child in entry.get('children', []):
                emit(child, pad + "  ")

        for top_level in self.summary_links:
            emit(top_level, "")

        try:
            target_dir = os.path.dirname(output_path)
            if target_dir and not os.path.exists(target_dir):
                os.makedirs(target_dir)

            with open(output_path, 'w', encoding='utf-8') as handle:
                handle.write("".join(pieces))

            print(f"完整报告已生成: {output_path}")
        except Exception as e:
            print(f"写入报告时出错: {e}")

    def generate_error_report(self, output_path):
        """
        Write the report that lists only the invalid links, grouped by source.

        Nothing is written when there are no invalid links. For every source
        document the matching SUMMARY.md entry is looked up in the tree that
        is already held in ``self.summary_links``. SUMMARY.md is not parsed
        again, so producing the report does not change
        ``self.invalid_links``, ``self.summary_links`` or the online queue.
        Failures while writing are printed, not raised.

        Args:
            output_path: Path of the report file; missing parent directories
                are created.
        """
        if not self.invalid_links:
            print("没有发现无效链接，不生成错误报告")
            return

        # Index the SUMMARY.md tree once: path relative to base_dir -> entry.
        # Document order is kept and the first entry for a path wins.
        summary_by_source = {}
        pending = list(reversed(self.summary_links))
        while pending:
            entry = pending.pop()
            pending.extend(reversed(entry.get('children', [])))

            file_path = entry.get('file_path')
            if not file_path or entry.get('is_external'):
                continue
            try:
                relative = os.path.relpath(file_path, self.base_dir)
            except ValueError:
                # relpath cannot relate paths on different Windows drives.
                continue
            summary_by_source.setdefault(relative, entry)

        # Group the invalid links by the document they were found in.
        links_by_source = defaultdict(list)
        for link in self.invalid_links:
            links_by_source[link.get('source_file', 'Unknown')].append(link)

        parts = [
            "# GitBook链接检查报告（仅错误链接）\n\n",
            "本报告仅显示文档中的无效链接。每行的格式为：\n",
            "* [文档标题](文档链接) | [无效链接](链接路径) ❌\n\n",
        ]

        for source, links in sorted(links_by_source.items()):
            parts.append(f"## 来自 {source}\n\n")

            if source == 'SUMMARY.md':
                # Invalid entries of SUMMARY.md itself: one bullet each.
                parts.extend(f"* [{link['text']}]({link['link']}) ❌\n" for link in links)
                # Blank line so the next heading does not touch the list.
                parts.append("\n")
                continue

            summary_link = summary_by_source.get(source)
            if summary_link:
                # One line: the source document followed by its invalid links.
                source_status = "✅" if summary_link.get('exists', False) else "❌"
                parts.append(f"* [{summary_link['text']}]({summary_link['link']}) {source_status}")
                parts.extend(f" | [{link['text']}]({link['link']}) ❌" for link in links)
                parts.append("\n\n")
            else:
                # The source is not listed in SUMMARY.md; show the links alone.
                parts.extend(f"* 来自: {source} - [{link['text']}]({link['link']}) ❌\n" for link in links)
                parts.append("\n")

        try:
            output_dir = os.path.dirname(output_path)
            if output_dir and not os.path.exists(output_dir):
                os.makedirs(output_dir)

            with open(output_path, 'w', encoding='utf-8') as file:
                file.write("".join(parts))

            print(f"错误报告已生成: {output_path}")
        except Exception as e:
            print(f"写入错误报告时出错: {e}")


def main():
    """
    Command-line entry point.

    Positional arguments, all optional: SUMMARY.md path, documentation root,
    report output path. Anything not given on the command line is asked for
    interactively, as are the online-verification switch and the thread
    count. When standard input is closed (for example in CI), every prompt
    falls back to its default instead of failing with EOFError.

    Exits with status 1 when SUMMARY.md does not exist or the check fails.
    """
    def ask(prompt):
        """Prompt the user; an exhausted stdin counts as an empty answer."""
        try:
            return input(prompt).strip()
        except EOFError:
            print()
            return ""

    print("=" * 60)
    print("多线程版GitBook链接检查器")
    print("=" * 60)

    # SUMMARY.md path
    if len(sys.argv) > 1:
        summary_path = sys.argv[1]
    else:
        summary_path = ask("请输入SUMMARY.md文件路径: ")
        if not summary_path:
            summary_path = os.path.join(os.getcwd(), "SUMMARY.md")
            print(f"使用默认路径: {summary_path}")

    if not os.path.isfile(summary_path):
        print(f"错误: 文件 '{summary_path}' 不存在")
        sys.exit(1)

    # Documentation root, defaulting to the directory of SUMMARY.md
    base_dir = os.path.dirname(os.path.abspath(summary_path))
    if len(sys.argv) > 2:
        base_dir = sys.argv[2]
    else:
        input_base_dir = ask(f"请输入文档根目录 [默认: {base_dir}]: ")
        if input_base_dir:
            base_dir = input_base_dir

    # Report path
    if len(sys.argv) > 3:
        output_path = sys.argv[3]
    else:
        default_output = os.path.join(base_dir, "link-check-report.md")
        output_path = ask(f"请输入输出文件路径 [默认: {default_output}]: ")
        if not output_path:
            output_path = default_output

    # A directory was given: put the report inside it.
    if os.path.isdir(output_path):
        output_path = os.path.join(output_path, "link-check-report.md")

    # Online verification is opt-in.
    verify_online = ask("是否验证在线链接? (y/n) [默认: n]: ").lower() == 'y'

    max_threads = 10
    if verify_online:
        try:
            max_threads = int(ask("请输入最大线程数 [默认: 10]: ") or "10")
            if max_threads < 1:
                max_threads = 10
                print("线程数必须大于0，已设置为默认值10")
        except ValueError:
            max_threads = 10
            print("输入无效，已设置为默认值10")

        print(f"将使用 {max_threads} 个线程并行检查在线链接")
    else:
        print("未验证的在线链接将被标记为错误，并添加到错误报告中")

    start_time = time.time()

    try:
        checker = LinkChecker(
            summary_path=summary_path,
            base_dir=base_dir,
            verify_online=verify_online,
            max_threads=max_threads
        )

        checker.check_links()
        checker.generate_reports(output_path)

        total_files = len(checker.processed_files)
        invalid_links = len(checker.invalid_links)

        elapsed_time = time.time() - start_time

        print("\n统计信息:")
        print(f"- 检查的文件数: {total_files}")
        print(f"- 无效链接数: {invalid_links}")
        print(f"- 耗时: {elapsed_time:.2f} 秒")

        print("\n检查完成！")
    except Exception as e:
        # Top-level boundary: report, show the traceback, signal failure.
        print(f"执行过程中出错: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


# Run the interactive checker only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()