Files
dify-docs/scripts/extract-gitbook-url.py
2025-03-21 20:15:22 +08:00

176 lines
5.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
改进的GitBook Summary链接提取器 (支持目录输出)
此脚本从SUMMARY.md文件中提取所有内容
保留原始的目录结构和标题,
将链接转换为在线URL不包含.md后缀
支持将输出文件放在指定目录中。
"""
import os
import re
import sys
import urllib.parse
def process_summary_file(summary_path, base_url):
"""
处理SUMMARY.md文件保留结构并转换链接
Args:
summary_path: SUMMARY.md文件的路径
base_url: 基础URL
Returns:
processed_content: 处理后的内容
"""
print(f"正在处理 {summary_path}...")
try:
with open(summary_path, 'r', encoding='utf-8') as file:
content = file.read()
except Exception as e:
print(f"读取文件时出错: {e}")
sys.exit(1)
# 确保base_url以/结尾
if not base_url.endswith('/'):
base_url += '/'
# 处理每一行
lines = content.split('\n')
processed_lines = []
for line in lines:
# 提取行中的Markdown链接
link_pattern = r'\[([^\]]+)\]\(([^)]+)\)'
matches = re.findall(link_pattern, line)
processed_line = line
# 替换每个链接
for text, link in matches:
# 跳过锚点链接
if link.startswith('#'):
continue
# 构建完整URL
if not link.startswith(('http://', 'https://')):
if link.startswith('/'):
link = link[1:]
full_url = urllib.parse.urljoin(base_url, link)
else:
full_url = link
# 移除.md后缀
if full_url.endswith('.md'):
full_url = full_url[:-3]
# 替换链接
original_link = f"[{text}]({link})"
new_link = f"[{text}]({full_url})"
processed_line = processed_line.replace(original_link, new_link)
processed_lines.append(processed_line)
return '\n'.join(processed_lines)
def save_to_markdown(content, output_path):
"""
保存处理后的内容到Markdown文件
Args:
content: 处理后的内容
output_path: 输出文件路径
"""
# 检查路径是否是目录
if os.path.isdir(output_path):
# 如果是目录,在该目录中创建默认文件名
output_file = os.path.join(output_path, "gitbook-urls.md")
else:
# 否则使用提供的路径
output_file = output_path
# 确保输出目录存在
output_dir = os.path.dirname(output_file)
if output_dir and not os.path.exists(output_dir):
try:
os.makedirs(output_dir)
print(f"已创建目录: {output_dir}")
except Exception as e:
print(f"创建目录时出错: {e}")
sys.exit(1)
try:
with open(output_file, 'w', encoding='utf-8') as file:
file.write(content)
print(f"Markdown文件已生成: {output_file}")
except Exception as e:
print(f"写入文件时出错: {e}")
sys.exit(1)
def add_header(content):
"""
向内容添加标题和说明
Args:
content: 原始内容
Returns:
new_content: 添加标题和说明后的内容
"""
header = "# GitBook文档链接\n\n"
header += "以下是从SUMMARY.md提取的文档结构和链接\n\n"
return header + content
if __name__ == "__main__":
print("=" * 60)
print("改进的GitBook Summary链接提取器 (支持目录输出)")
print("=" * 60)
# 获取SUMMARY.md文件路径
if len(sys.argv) > 1:
summary_path = sys.argv[1]
else:
summary_path = input("请输入SUMMARY.md文件路径: ").strip()
if not summary_path:
summary_path = os.path.join(os.getcwd(), "SUMMARY.md")
print(f"使用默认路径: {summary_path}")
# 检查文件是否存在
if not os.path.isfile(summary_path):
print(f"错误: 文件 '{summary_path}' 不存在")
sys.exit(1)
# 获取基础URL
if len(sys.argv) > 2:
base_url = sys.argv[2]
else:
base_url = input("请输入文档基础URL: ").strip()
if not base_url:
base_url = "https://docs.example.com/"
print(f"使用默认URL: {base_url}")
# 获取输出文件路径或目录
if len(sys.argv) > 3:
output_path = sys.argv[3]
else:
default_output = os.path.join(os.path.dirname(summary_path), "gitbook-urls.md")
output_path = input(f"请输入输出文件路径或目录 [默认: {default_output}]: ").strip()
if not output_path:
output_path = default_output
# 处理文件内容
processed_content = process_summary_file(summary_path, base_url)
# 添加标题和说明
final_content = add_header(processed_content)
# 保存到Markdown文件
save_to_markdown(final_content, output_path)
print("\n处理完成!")