dify-docs/scripts/extract-local-file-url.py

#!/usr/bin/env python3
"""
本地GitBook Markdown文件链接检查工具

此脚本会:
1. 从SUMMARY.md提取所有文档链接
2. 解析每个本地Markdown文件
3. 提取并验证文件中的内部链接
4. 生成链接检查报告
"""

import os
import re
import sys
import csv
from datetime import datetime
from urllib.parse import urlparse, urljoin

# 尝试导入依赖，如果不存在则自动安装
try:
    from bs4 import BeautifulSoup
    import markdown
except ImportError:
    print("正在安装必要依赖...")
    import subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "beautifulsoup4", "markdown"])
    from bs4 import BeautifulSoup
    import markdown


class GitbookLocalChecker:
    """GitBook本地文件链接检查工具"""

    def __init__(self, summary_path, base_dir=None, remove_md=True):
        """
        初始化链接检查器

        Args:
            summary_path: SUMMARY.md文件路径
            base_dir: 文档根目录，默认为SUMMARY.md所在目录
            remove_md: 是否移除.md后缀
        """
        self.summary_path = os.path.abspath(summary_path)
        self.base_dir = base_dir or os.path.dirname(self.summary_path)
        self.remove_md = remove_md
        self.all_links = []
        self.all_md_files = []
        self.invalid_links = []

        # 记录解析过的文件，避免重复处理
        self.processed_files = set()

    def extract_summary_links(self):
        """从SUMMARY.md提取所有Markdown文件链接"""
        print(f"正在从 {self.summary_path} 提取文档链接...")

        with open(self.summary_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # 使用正则表达式提取链接
        link_pattern = r'\[([^\]]+)\]\(([^)]+)\)'
        matches = re.findall(link_pattern, content)

        links = []
        for i, (text, link) in enumerate(matches, 1):
            # 排除锚点链接
            if not link.startswith('#') and link.endswith('.md'):
                # 计算本地文件路径
                local_path = os.path.normpath(os.path.join(self.base_dir, link))

                links.append({
                    'id': i,
                    'text': text,
                    'link': link,
                    'local_path': local_path,
                    'exists': os.path.exists(local_path),
                    'type': 'summary_link',
                    'source_file': 'SUMMARY.md'
                })

                # 将文件添加到待处理列表
                if os.path.exists(local_path):
                    self.all_md_files.append(local_path)

        print(f"找到 {len(links)} 个文档链接，{len(self.all_md_files)} 个本地Markdown文件")
        self.all_links.extend(links)
        return links

    def process_md_file(self, file_path):
        """处理单个Markdown文件，提取其中的链接"""
        # 如果文件已处理，跳过
        if file_path in self.processed_files:
            return []

        self.processed_files.add(file_path)
        relative_path = os.path.relpath(file_path, self.base_dir)

        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()

            # 提取所有链接
            link_pattern = r'\[([^\]]+)\]\(([^)]+)\)'
            matches = re.findall(link_pattern, content)

            links = []
            for text, link in matches:
                # 排除外部链接和锚点链接
                if link.startswith(('http://', 'https://', '#')):
                    continue

                # 解析相对路径
                if link.startswith('/'):
                    # 从根目录计算
                    target_path = os.path.normpath(os.path.join(self.base_dir, link.lstrip('/')))
                else:
                    # 从当前文件所在目录计算
                    target_path = os.path.normpath(os.path.join(os.path.dirname(file_path), link))

                # 如果链接没有扩展名但指向目录，添加README.md
                if not os.path.splitext(target_path)[1]:
                    if os.path.isdir(target_path):
                        target_path = os.path.join(target_path, 'README.md')
                    else:
                        # 可能是不带扩展名的文件引用，添加.md
                        target_path += '.md'

                # 检查链接是否有效
                exists = os.path.exists(target_path)

                link_info = {
                    'text': text,
                    'link': link,
                    'local_path': target_path,
                    'target_file': os.path.basename(target_path),
                    'exists': exists,
                    'type': 'internal_link',
                    'source_file': relative_path
                }

                links.append(link_info)

                # 如果链接无效，添加到无效链接列表
                if not exists:
                    self.invalid_links.append(link_info)
                # 如果是有效的Markdown文件且尚未处理，添加到待处理列表
                elif target_path.endswith('.md') and target_path not in self.processed_files:
                    self.all_md_files.append(target_path)

            return links

        except Exception as e:
            print(f"处理文件 {file_path} 时出错: {e}")
            return []

    def process_all_files(self):
        """处理所有Markdown文件"""
        print("开始处理所有Markdown文件...")

        # 先提取SUMMARY.md中的链接
        self.extract_summary_links()

        # 处理所有Markdown文件
        files_to_process = list(self.all_md_files)  # 创建副本，因为处理过程中会添加新文件
        processed_count = 0

        for file_path in files_to_process:
            if file_path not in self.processed_files:
                relative_path = os.path.relpath(file_path, self.base_dir)
                print(f"处理文件: {relative_path}")

                links = self.process_md_file(file_path)
                self.all_links.extend(links)

                processed_count += 1

                # 如果发现新文件，可能需要处理它们
                new_files = [f for f in self.all_md_files if f not in files_to_process and f not in self.processed_files]
                files_to_process.extend(new_files)

        print(f"已处理 {processed_count} 个Markdown文件")
        print(f"共找到 {len(self.all_links)} 个链接，其中 {len(self.invalid_links)} 个无效")

    def generate_markdown_report(self, output_path):
        """生成Markdown格式的报告"""
        print(f"正在生成报告: {output_path}")

        content = f"""# GitBook本地链接检查报告

## 摘要
- 检查时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
- 处理文件数: {len(self.processed_files)}
- 总链接数: {len(self.all_links)}
- 无效链接数: {len(self.invalid_links)}

## 无效链接列表
"""

        # 按源文件分组显示无效链接
        grouped_links = {}
        for link in self.invalid_links:
            source = link['source_file']
            if source not in grouped_links:
                grouped_links[source] = []
            grouped_links[source].append(link)

        for source, links in sorted(grouped_links.items()):
            content += f"\n### 文件: {source}\n"
            for link in links:
                content += f"- [{link['text']}]({link['link']}) -> {link['local_path']} (无效)\n"

        # 添加所有文件的链接统计
        content += "\n## 文件链接统计\n"
        file_stats = {}
        for link in self.all_links:
            source = link['source_file']
            if source not in file_stats:
                file_stats[source] = {'total': 0, 'invalid': 0}
            file_stats[source]['total'] += 1
            if not link['exists']:
                file_stats[source]['invalid'] += 1

        for source, stats in sorted(file_stats.items()):
            content += f"- {source}: 共 {stats['total']} 个链接，{stats['invalid']} 个无效\n"

        with open(output_path, 'w', encoding='utf-8') as file:
            file.write(content)

        print(f"报告已生成: {output_path}")

    def generate_csv_report(self, output_path):
        """生成CSV格式的报告"""
        print(f"正在生成CSV报告: {output_path}")

        with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['source_file', 'text', 'link', 'local_path', 'exists', 'type']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

            for link in self.all_links:
                writer.writerow({
                    'source_file': link['source_file'],
                    'text': link['text'],
                    'link': link['link'],
                    'local_path': link['local_path'],
                    'exists': link['exists'],
                    'type': link['type']
                })

        print(f"CSV报告已生成: {output_path}")


def get_input_with_default(prompt, default=None):
    """获取用户输入，如果为空则使用默认值"""
    if default:
        user_input = input(f"{prompt} [{default}]: ")
        return user_input if user_input.strip() else default
    else:
        return input(f"{prompt}: ")


def get_yes_no_input(prompt, default="y"):
    """获取用户是/否输入"""
    valid_responses = {
        'y': True, 'yes': True, '是': True,
        'n': False, 'no': False, '否': False
    }

    if default.lower() in ['y', 'yes', '是']:
        prompt = f"{prompt} [Y/n]: "
        default_value = True
    else:
        prompt = f"{prompt} [y/N]: "
        default_value = False

    user_input = input(prompt).lower()

    if not user_input:
        return default_value

    return valid_responses.get(user_input, default_value)


def main():
    """主函数，交互式获取输入"""
    print("=" * 60)
    print("本地GitBook Markdown文件链接检查工具")
    print("=" * 60)

    # 获取SUMMARY.md文件路径
    while True:
        summary_path = get_input_with_default(
            "请输入SUMMARY.md文件路径",
            os.path.join(os.getcwd(), "SUMMARY.md")
        )

        # 检查文件是否存在
        if os.path.isfile(summary_path):
            break
        else:
            print(f"错误: 文件 '{summary_path}' 不存在")

    # 获取文档根目录
    default_base_dir = os.path.dirname(os.path.abspath(summary_path))
    base_dir = get_input_with_default(
        "请输入文档根目录(包含所有Markdown文件的目录)",
        default_base_dir
    )

    # 获取输出目录
    output_dir = get_input_with_default(
        "请输入输出目录",
        os.path.dirname(summary_path) or os.getcwd()
    )

    # 确保输出目录存在
    os.makedirs(output_dir, exist_ok=True)

    # 生成文件路径
    report_path = os.path.join(output_dir, "gitbook-links-report.md")
    csv_path = os.path.join(output_dir, "gitbook-links-report.csv")

    # 询问是否移除.md后缀
    remove_md = get_yes_no_input("是否移除链接中的.md后缀", "y")

    try:
        # 创建检查器实例
        checker = GitbookLocalChecker(
            summary_path=summary_path,
            base_dir=base_dir,
            remove_md=remove_md
        )

        # 处理所有文件
        checker.process_all_files()

        # 生成报告
        checker.generate_markdown_report(report_path)
        checker.generate_csv_report(csv_path)

        print("\n检查完成！")
        print(f"Markdown报告: {report_path}")
        print(f"CSV报告: {csv_path}")

        # 显示摘要
        print(f"\n摘要:")
        print(f"- 处理文件数: {len(checker.processed_files)}")
        print(f"- 总链接数: {len(checker.all_links)}")
        print(f"- 无效链接数: {len(checker.invalid_links)}")

        if checker.invalid_links:
            print("\n无效链接示例:")
            for i, link in enumerate(checker.invalid_links[:5], 1):
                print(f"{i}. 文件 '{link['source_file']}' 中 [{link['text']}]({link['link']}) -> {link['local_path']} (无效)")

            if len(checker.invalid_links) > 5:
                print(f"... 以及其他 {len(checker.invalid_links) - 5} 个无效链接")

    except Exception as e:
        print(f"执行过程中出错: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()