mirror of
https://github.com/langgenius/dify-docs.git
synced 2026-03-27 13:28:32 +07:00
Feat: update jp docs
This commit is contained in:
757
scripts/auto-url-check.py
Normal file
757
scripts/auto-url-check.py
Normal file
@@ -0,0 +1,757 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
多线程版GitBook链接检查器
|
||||
|
||||
此脚本使用多线程并行检查在线链接,大幅提高检查速度。
|
||||
生成两个报告文件:
|
||||
1. 包含所有链接的完整报告
|
||||
2. 仅包含错误链接的报告
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import threading
|
||||
import queue
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from collections import defaultdict
|
||||
from urllib.parse import urlparse
|
||||
|
||||
try:
|
||||
import requests
|
||||
from requests.exceptions import RequestException
|
||||
except ImportError:
|
||||
print("正在安装requests库...")
|
||||
import subprocess
|
||||
subprocess.check_call([sys.executable, "-m", "pip", "install", "requests"])
|
||||
import requests
|
||||
from requests.exceptions import RequestException
|
||||
|
||||
class LinkChecker:
    """Multithreaded link checker for GitBook-style documentation trees."""

    def __init__(self, summary_path, base_dir=None, verify_online=True, max_threads=10):
        """
        Set up the checker.

        Args:
            summary_path: path to SUMMARY.md
            base_dir: documentation root; defaults to the directory of SUMMARY.md
            verify_online: whether to verify http(s) links over the network
            max_threads: maximum number of worker threads
        """
        self.summary_path = os.path.abspath(summary_path)
        self.base_dir = base_dir if base_dir else os.path.dirname(self.summary_path)
        self.verify_online = verify_online
        self.max_threads = max_threads

        self.summary_links = []            # top-level link tree parsed from SUMMARY.md
        self.md_links = defaultdict(list)  # links found in each referenced document
        self.processed_files = set()       # files already scanned
        self.summary_content = ""          # raw text of SUMMARY.md
        self.invalid_links = []            # every link that failed validation

        # Extensions treated as images (image links are skipped, assumed valid).
        self.image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.svg', '.bmp', '.tiff', '.webp')

        # Cache of online-check verdicts so each URL is fetched at most once.
        self.online_link_cache = {}
        self.online_link_cache_lock = threading.Lock()

        # Queue of http(s) URLs waiting to be verified by worker threads.
        self.online_links_queue = queue.Queue()

        # Progress counters shared by the worker threads.
        self.total_online_links = 0
        self.checked_online_links = 0
        self.progress_lock = threading.Lock()
|
||||
|
||||
def is_image_link(self, link):
|
||||
"""
|
||||
检查链接是否为图片链接
|
||||
|
||||
Args:
|
||||
link: 链接路径
|
||||
|
||||
Returns:
|
||||
is_image: 是否为图片链接
|
||||
"""
|
||||
return link.lower().endswith(self.image_extensions)
|
||||
|
||||
    def check_online_link(self, url):
        """
        Check whether an online URL is reachable.

        Args:
            url: http(s) URL to check

        Returns:
            is_valid: True if the URL responded with a status code < 400
        """
        # Return the cached verdict if this URL was already checked.
        with self.online_link_cache_lock:
            if url in self.online_link_cache:
                return self.online_link_cache[url]

        if not self.verify_online:
            # Online verification disabled: treat the link as invalid by default.
            with self.online_link_cache_lock:
                self.online_link_cache[url] = False
            return False

        try:
            # Try HEAD first: it is cheaper than a full GET.
            response = requests.head(
                url,
                timeout=5,
                allow_redirects=True,
                headers={'User-Agent': 'Mozilla/5.0 GitBook-Link-Checker/1.0'}
            )

            if response.status_code < 400:
                # Any status below 400 counts as a valid link.
                with self.online_link_cache_lock:
                    self.online_link_cache[url] = True
                return True

            # HEAD was rejected (some servers disallow it) — fall back to GET.
            response = requests.get(
                url,
                timeout=5,
                allow_redirects=True,
                headers={'User-Agent': 'Mozilla/5.0 GitBook-Link-Checker/1.0'}
            )

            result = response.status_code < 400
            with self.online_link_cache_lock:
                self.online_link_cache[url] = result
            return result

        except RequestException:
            # Network-level failure: record the link as invalid.
            with self.online_link_cache_lock:
                self.online_link_cache[url] = False
            return False
|
||||
|
||||
    def resolve_path(self, link, current_dir):
        """
        Resolve a link to its actual target.

        Args:
            link: link path as written in the document
            current_dir: directory of the file containing the link

        Returns:
            resolved_path: resolved path (or the URL for external links)
            is_external: whether the link is external (http/https/mailto/tel)
            is_valid: True/False, or None for http(s) links whose check is
                deferred to the online-checker worker threads
        """
        if not link:
            return None, False, False

        # Strip an anchor fragment; a bare "#anchor" is assumed valid.
        if '#' in link:
            link_part = link.split('#')[0]
            if not link_part:  # anchor only, no path component
                return None, False, True
            link = link_part

        # Image links are skipped and assumed valid.
        if self.is_image_link(link):
            return None, False, True

        # External links.
        if link.startswith(('http://', 'https://', 'mailto:', 'tel:')):
            # http(s) links are queued for the worker threads when verifying.
            if link.startswith(('http://', 'https://')) and self.verify_online:
                self.online_links_queue.put(link)
                with self.progress_lock:
                    self.total_online_links += 1

                # Validity unknown for now; update_link_statuses() fills it in later.
                return link, True, None
            elif link.startswith(('http://', 'https://')) and not self.verify_online:
                # Online verification disabled: flag as invalid.
                return link, True, False
            else:
                # mailto: and tel: links are assumed valid.
                return link, True, True

        # Absolute path (relative to the documentation root).
        if link.startswith('/'):
            resolved_path = os.path.normpath(os.path.join(self.base_dir, link.lstrip('/')))
        # Relative path (relative to the current file's directory).
        else:
            resolved_path = os.path.normpath(os.path.join(current_dir, link))

        # Directory link: prefer README.md, then index.md.
        if os.path.isdir(resolved_path):
            readme_path = os.path.join(resolved_path, 'README.md')
            if os.path.exists(readme_path):
                return readme_path, False, True
            index_path = os.path.join(resolved_path, 'index.md')
            if os.path.exists(index_path):
                return index_path, False, True
            # Neither exists: report the directory itself.
            return resolved_path, False, os.path.exists(resolved_path)

        # Extension-less file reference: try appending ".md".
        if not os.path.exists(resolved_path) and '.' not in os.path.basename(resolved_path):
            md_path = f"{resolved_path}.md"
            if os.path.exists(md_path):
                return md_path, False, True

        return resolved_path, False, os.path.exists(resolved_path)
|
||||
|
||||
def online_link_worker(self):
|
||||
"""工作线程:处理在线链接检查"""
|
||||
while True:
|
||||
try:
|
||||
# 从队列获取链接
|
||||
url = self.online_links_queue.get(block=False)
|
||||
|
||||
# 检查链接
|
||||
is_valid = self.check_online_link(url)
|
||||
|
||||
# 更新进度
|
||||
with self.progress_lock:
|
||||
self.checked_online_links += 1
|
||||
checked = self.checked_online_links
|
||||
total = self.total_online_links
|
||||
|
||||
# 显示进度
|
||||
print(f"在线链接检查进度: [{checked}/{total}] - {url} - {'✅' if is_valid else '❌'}")
|
||||
|
||||
# 标记任务完成
|
||||
self.online_links_queue.task_done()
|
||||
except queue.Empty:
|
||||
# 队列为空,退出线程
|
||||
break
|
||||
|
||||
def extract_sections_from_summary(self):
|
||||
"""
|
||||
从SUMMARY.md提取所有章节信息
|
||||
|
||||
Returns:
|
||||
sections: 章节列表
|
||||
"""
|
||||
print(f"从 {self.summary_path} 提取章节信息...")
|
||||
|
||||
try:
|
||||
with open(self.summary_path, 'r', encoding='utf-8') as file:
|
||||
self.summary_content = file.read()
|
||||
except Exception as e:
|
||||
print(f"读取文件时出错: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
# 提取所有章节标题
|
||||
sections = []
|
||||
section_pattern = r'^#+\s+(.*?)(?:\s+<a.*?>)?$'
|
||||
|
||||
for line in self.summary_content.split('\n'):
|
||||
match = re.match(section_pattern, line)
|
||||
if match:
|
||||
section_title = match.group(1).strip()
|
||||
sections.append(section_title)
|
||||
|
||||
return sections
|
||||
|
||||
    def extract_links_from_summary(self):
        """
        Extract every link from SUMMARY.md together with its hierarchy.

        Side effects: fills ``self.summary_links`` with the top-level tree and
        appends definitely-broken links to ``self.invalid_links``.

        Returns:
            links: flat list of link records (each carrying a 'children' list)
        """
        print(f"从 {self.summary_path} 提取链接...")

        # Track which section heading each link belongs to.
        current_section = ""
        sections = self.extract_sections_from_summary()  # also loads summary_content

        # Process SUMMARY.md line by line.
        links = []

        for line in self.summary_content.split('\n'):
            # Section heading line?
            section_match = re.match(r'^#+\s+(.*?)(?:\s+<a.*?>)?$', line)
            if section_match:
                current_section = section_match.group(1).strip()
                continue

            # Only bullet lines ("* [..](..)") carry links.
            indent_match = re.match(r'^(\s*)\*', line)
            if not indent_match:
                continue

            indent = indent_match.group(1)
            level = len(indent) // 2  # assumes 2 spaces of indentation per level

            # The Markdown link on this line.
            link_match = re.search(r'\[([^\]]+)\]\(([^)]+)\)', line)
            if not link_match:
                continue

            text, link = link_match.groups()

            # Pure anchor links are skipped.
            if link.startswith('#'):
                continue

            # Resolve to an actual file path / URL; is_valid may be None for
            # http(s) links whose check is deferred to the worker threads.
            file_path, is_external, is_valid = self.resolve_path(link, self.base_dir)

            link_info = {
                'text': text,
                'link': link,
                'file_path': file_path,
                'exists': is_valid,
                'level': level,
                'section': current_section,
                'is_external': is_external,
                'children': [],  # nested entries get attached below
                'source_file': 'SUMMARY.md'
            }

            links.append(link_info)

            # Only a definite False counts as invalid here; None means the
            # online check is still pending.
            if is_valid is False:
                self.invalid_links.append(link_info)

        # Rebuild the nesting from the indentation levels.
        root_links = []
        level_stack = [None]  # last link seen at each level

        for link in links:
            level = link['level']

            # Pop deeper levels that have ended.
            while len(level_stack) > level + 1:
                level_stack.pop()

            # Pad the stack if the document skipped a level.
            while len(level_stack) < level + 1:
                level_stack.append(None)

            if level == 0:
                # Top-level entry.
                root_links.append(link)
            else:
                # Attach to the most recent link one level up, if any.
                parent = level_stack[level - 1]
                if parent:
                    parent['children'].append(link)

            # Remember this link as the latest one at its level.
            level_stack[level] = link

        self.summary_links = root_links
        return links
|
||||
|
||||
    def extract_links_from_markdown(self, file_path):
        """
        Extract links from a Markdown file.

        Args:
            file_path: path to the Markdown file

        Returns:
            links: list of link records found in the file (empty if the file
                was already processed, is missing, or is not a .md file)
        """
        # Each file is scanned at most once.
        if not file_path or file_path in self.processed_files:
            return []

        if not os.path.exists(file_path) or not file_path.endswith('.md'):
            return []

        self.processed_files.add(file_path)

        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
        except Exception as e:
            print(f"读取文件 {file_path} 时出错: {e}")
            return []

        # Markdown links: [text](target)
        link_pattern = r'\[([^\]]+)\]\(([^)]+)\)'
        matches = re.findall(link_pattern, content)

        links = []
        current_dir = os.path.dirname(file_path)
        relative_source_path = os.path.relpath(file_path, self.base_dir)

        for text, link in matches:
            # Image links are not checked.
            if self.is_image_link(link):
                continue

            # Resolve the target; is_valid may be None for deferred online links.
            resolved_path, is_external, is_valid = self.resolve_path(link, current_dir)

            link_info = {
                'text': text,
                'link': link,
                'file_path': resolved_path,
                'exists': is_valid,
                'is_external': is_external,
                'source_file': relative_source_path
            }

            links.append(link_info)

            # Index the link under its source file.
            if file_path not in self.md_links:
                self.md_links[file_path] = []
            self.md_links[file_path].append(link_info)

            # Only a definite False counts as invalid here; None means the
            # online check is still pending.
            if is_valid is False:
                self.invalid_links.append(link_info)

        return links
|
||||
|
||||
    def check_links(self):
        """
        Check every link reachable from SUMMARY.md.

        Walks the SUMMARY link tree, scanning each referenced local Markdown
        file for further links, then verifies queued online links with worker
        threads and back-fills their status.
        """
        # Parse SUMMARY.md first (fills self.summary_links).
        self.extract_links_from_summary()

        # Recursively scan every referenced local .md file.
        def process_link(link):
            if not link.get('is_external') and link.get('exists') and link.get('file_path') and link.get('file_path').endswith('.md'):
                try:
                    relative_path = os.path.relpath(link['file_path'], self.base_dir)
                    print(f"检查文件: {relative_path}")
                    self.extract_links_from_markdown(link['file_path'])
                except Exception as e:
                    print(f"处理文件 {link.get('file_path')} 时出错: {e}")

            # Recurse into nested SUMMARY entries.
            for child in link.get('children', []):
                process_link(child)

        # Process every top-level entry.
        for link in self.summary_links:
            process_link(link)

        # Verify the queued online links in parallel, if requested.
        if self.verify_online and self.total_online_links > 0:
            self.check_online_links_with_threads()

        # Replace the deferred (None) statuses with the cached verdicts.
        self.update_link_statuses()
|
||||
|
||||
    def check_online_links_with_threads(self):
        """Verify all queued online links using a pool of worker threads."""
        print(f"\n开始使用多线程检查在线链接,共有 {self.total_online_links} 个链接...")

        # Never start more threads than there are links to check.
        num_threads = min(self.max_threads, self.total_online_links)

        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            # Each worker drains the shared queue until it is empty.
            # NOTE(review): the futures are never inspected — if a worker dies
            # without calling task_done() for its item, the join() below would
            # block forever; the worker is expected to guard against that.
            futures = [executor.submit(self.online_link_worker) for _ in range(num_threads)]

            # Block until every queued URL has been marked done.
            self.online_links_queue.join()

        print(f"所有在线链接检查完成,共 {self.total_online_links} 个")
|
||||
|
||||
def update_link_statuses(self):
|
||||
"""根据检查结果更新链接状态"""
|
||||
# 更新所有链接的有效性状态
|
||||
def update_link(link):
|
||||
if link.get('is_external') and link.get('file_path') and link.get('file_path').startswith(('http://', 'https://')):
|
||||
with self.online_link_cache_lock:
|
||||
is_valid = self.online_link_cache.get(link['file_path'], False)
|
||||
|
||||
link['exists'] = is_valid
|
||||
|
||||
# 如果链接无效,添加到无效链接列表
|
||||
if not is_valid and link not in self.invalid_links:
|
||||
self.invalid_links.append(link)
|
||||
|
||||
# 递归处理子链接
|
||||
for child in link.get('children', []):
|
||||
update_link(child)
|
||||
|
||||
# 处理所有顶级链接
|
||||
for link in self.summary_links:
|
||||
update_link(link)
|
||||
|
||||
# 更新文档链接字典
|
||||
for file_path, links in self.md_links.items():
|
||||
for link in links:
|
||||
if link.get('is_external') and link.get('file_path') and link.get('file_path').startswith(('http://', 'https://')):
|
||||
with self.online_link_cache_lock:
|
||||
is_valid = self.online_link_cache.get(link['file_path'], False)
|
||||
|
||||
link['exists'] = is_valid
|
||||
|
||||
# 如果链接无效,添加到无效链接列表
|
||||
if not is_valid and link not in self.invalid_links:
|
||||
self.invalid_links.append(link)
|
||||
|
||||
def generate_reports(self, output_path):
|
||||
"""
|
||||
生成两个报告:完整报告和错误链接报告
|
||||
|
||||
Args:
|
||||
output_path: 完整报告输出文件路径
|
||||
"""
|
||||
# 生成完整报告
|
||||
self.generate_full_report(output_path)
|
||||
|
||||
# 生成错误链接报告
|
||||
error_report_path = output_path.replace('.md', '-error.md')
|
||||
if output_path == error_report_path:
|
||||
error_report_path = os.path.splitext(output_path)[0] + '-error.md'
|
||||
|
||||
self.generate_error_report(error_report_path)
|
||||
|
||||
    def generate_full_report(self, output_path):
        """
        Generate the full report listing every link.

        Args:
            output_path: output file path
        """
        content = "# GitBook链接检查报告(完整版)\n\n"

        # Explain the per-line report format.
        content += "本报告显示了GitBook文档中的所有链接及其引用的文档。每行的格式为:\n"
        content += "* [文档标题](文档链接) | [引用的文档1](链接1) | [引用的文档2](链接2) | ...\n\n"

        # Emit each section heading only once.
        processed_sections = set()

        # Recursively render one link (and its children) into `content`.
        def generate_link_report(link, indent=""):
            nonlocal content

            # Start a new section when this link belongs to an unseen one.
            if 'section' in link and link['section'] and link['section'] not in processed_sections:
                content += f"\n## {link['section']}\n\n"
                processed_sections.add(link['section'])

            # The SUMMARY entry itself.
            file_path = link.get('file_path')
            status = "✅" if link.get('exists', False) else "❌"

            content += f"{indent}* [{link['text']}]({link['link']}) {status}"

            # Append every non-image link referenced inside that document.
            if file_path and file_path in self.md_links and self.md_links[file_path]:
                referenced_links = self.md_links[file_path]

                for ref_link in referenced_links:
                    # Skip image links.
                    if 'link' in ref_link and self.is_image_link(ref_link['link']):
                        continue

                    ref_status = "✅" if ref_link.get('exists', False) else "❌"
                    content += f" | [{ref_link['text']}]({ref_link['link']}) {ref_status}"

            content += "\n"

            # Children render one extra indent level deep.
            for child in link.get('children', []):
                generate_link_report(child, indent + " ")

        # Render every top-level entry.
        for link in self.summary_links:
            generate_link_report(link)

        # Write the report, creating the output directory when necessary.
        try:
            output_dir = os.path.dirname(output_path)
            if output_dir and not os.path.exists(output_dir):
                os.makedirs(output_dir)

            with open(output_path, 'w', encoding='utf-8') as file:
                file.write(content)

            print(f"完整报告已生成: {output_path}")
        except Exception as e:
            print(f"写入报告时出错: {e}")
|
||||
|
||||
def generate_error_report(self, output_path):
|
||||
"""
|
||||
生成仅包含错误链接的报告
|
||||
|
||||
Args:
|
||||
output_path: 输出文件路径
|
||||
"""
|
||||
if not self.invalid_links:
|
||||
print(f"没有发现无效链接,不生成错误报告")
|
||||
return
|
||||
|
||||
content = "# GitBook链接检查报告(仅错误链接)\n\n"
|
||||
content += "本报告仅显示文档中的无效链接。每行的格式为:\n"
|
||||
content += "* [文档标题](文档链接) | [无效链接](链接路径) ❌\n\n"
|
||||
|
||||
# 按源文件组织无效链接
|
||||
links_by_source = defaultdict(list)
|
||||
|
||||
for link in self.invalid_links:
|
||||
source = link.get('source_file', 'Unknown')
|
||||
links_by_source[source].append(link)
|
||||
|
||||
# 按源文件添加无效链接
|
||||
for source, links in sorted(links_by_source.items()):
|
||||
# 添加源文件标题
|
||||
content += f"## 来自 {source}\n\n"
|
||||
|
||||
# 找到源文件在summary中的对应链接
|
||||
summary_link = None
|
||||
|
||||
# 查找源文件对应的summary链接
|
||||
for link in self.extract_links_from_summary():
|
||||
if link.get('file_path') and os.path.relpath(link['file_path'], self.base_dir) == source:
|
||||
summary_link = link
|
||||
break
|
||||
|
||||
# 如果是SUMMARY.md本身
|
||||
if source == 'SUMMARY.md':
|
||||
# 添加每个无效链接
|
||||
for link in links:
|
||||
status = "❌"
|
||||
content += f"* [{link['text']}]({link['link']}) {status}\n"
|
||||
else:
|
||||
# 如果找到了源文件对应的summary链接
|
||||
if summary_link:
|
||||
# 显示源文件链接和其中的无效链接
|
||||
source_status = "✅" if summary_link.get('exists', False) else "❌"
|
||||
content += f"* [{summary_link['text']}]({summary_link['link']}) {source_status}"
|
||||
|
||||
# 添加源文件中的无效链接
|
||||
for link in links:
|
||||
content += f" | [{link['text']}]({link['link']}) ❌"
|
||||
|
||||
content += "\n\n"
|
||||
else:
|
||||
# 没有找到源文件对应的summary链接,只显示无效链接
|
||||
for link in links:
|
||||
content += f"* 来自: {source} - [{link['text']}]({link['link']}) ❌\n"
|
||||
|
||||
content += "\n"
|
||||
|
||||
# 保存报告
|
||||
try:
|
||||
# 确保输出目录存在
|
||||
output_dir = os.path.dirname(output_path)
|
||||
if output_dir and not os.path.exists(output_dir):
|
||||
os.makedirs(output_dir)
|
||||
|
||||
with open(output_path, 'w', encoding='utf-8') as file:
|
||||
file.write(content)
|
||||
|
||||
print(f"错误报告已生成: {output_path}")
|
||||
except Exception as e:
|
||||
print(f"写入错误报告时出错: {e}")
|
||||
|
||||
|
||||
def main():
    """Entry point: gather options from argv or interactive prompts, run the check."""
    print("=" * 60)
    print("多线程版GitBook链接检查器")
    print("=" * 60)

    # SUMMARY.md path: argv[1] or prompt (default: ./SUMMARY.md).
    if len(sys.argv) > 1:
        summary_path = sys.argv[1]
    else:
        summary_path = input("请输入SUMMARY.md文件路径: ").strip()
        if not summary_path:
            summary_path = os.path.join(os.getcwd(), "SUMMARY.md")
            print(f"使用默认路径: {summary_path}")

    # Fail early if the file does not exist.
    if not os.path.isfile(summary_path):
        print(f"错误: 文件 '{summary_path}' 不存在")
        sys.exit(1)

    # Documentation root: argv[2] or prompt (default: directory of SUMMARY.md).
    base_dir = os.path.dirname(os.path.abspath(summary_path))
    if len(sys.argv) > 2:
        base_dir = sys.argv[2]
    else:
        input_base_dir = input(f"请输入文档根目录 [默认: {base_dir}]: ").strip()
        if input_base_dir:
            base_dir = input_base_dir

    # Report path: argv[3] or prompt.
    if len(sys.argv) > 3:
        output_path = sys.argv[3]
    else:
        default_output = os.path.join(base_dir, "link-check-report.md")
        output_path = input(f"请输入输出文件路径 [默认: {default_output}]: ").strip()
        if not output_path:
            output_path = default_output

    # A directory means "use the default file name inside it".
    if os.path.isdir(output_path):
        output_path = os.path.join(output_path, "link-check-report.md")

    # Whether to actually fetch online links over the network.
    verify_online = input("是否验证在线链接? (y/n) [默认: n]: ").strip().lower() == 'y'

    max_threads = 10
    if verify_online:
        # Worker-thread count; falls back to 10 on bad or out-of-range input.
        try:
            max_threads = int(input(f"请输入最大线程数 [默认: 10]: ").strip() or "10")
            if max_threads < 1:
                max_threads = 10
                print(f"线程数必须大于0,已设置为默认值10")
        except ValueError:
            max_threads = 10
            print(f"输入无效,已设置为默认值10")

        print(f"将使用 {max_threads} 个线程并行检查在线链接")
    else:
        print("未验证的在线链接将被标记为错误,并添加到错误报告中")

    start_time = time.time()

    try:
        # Build the checker and run the full pass.
        checker = LinkChecker(
            summary_path=summary_path,
            base_dir=base_dir,
            verify_online=verify_online,
            max_threads=max_threads
        )

        checker.check_links()
        checker.generate_reports(output_path)

        # Summary statistics.
        total_files = len(checker.processed_files)
        invalid_links = len(checker.invalid_links)

        end_time = time.time()
        elapsed_time = end_time - start_time

        print(f"\n统计信息:")
        print(f"- 检查的文件数: {total_files}")
        print(f"- 无效链接数: {invalid_links}")
        print(f"- 耗时: {elapsed_time:.2f} 秒")

        print("\n检查完成!")
    except Exception as e:
        print(f"执行过程中出错: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
176
scripts/extract-gitbook-url.py
Normal file
176
scripts/extract-gitbook-url.py
Normal file
@@ -0,0 +1,176 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
改进的GitBook Summary链接提取器 (支持目录输出)
|
||||
|
||||
此脚本从SUMMARY.md文件中提取所有内容,
|
||||
保留原始的目录结构和标题,
|
||||
将链接转换为在线URL(不包含.md后缀)。
|
||||
支持将输出文件放在指定目录中。
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import urllib.parse
|
||||
|
||||
def process_summary_file(summary_path, base_url):
    """
    Process SUMMARY.md: keep the document structure, rewrite links to online URLs.

    Relative links are resolved against *base_url* and the ``.md`` suffix is
    stripped — including when the link carries an ``#anchor`` fragment, a case
    the original ``endswith('.md')`` check missed.

    Args:
        summary_path: path to the SUMMARY.md file
        base_url: base URL of the published documentation

    Returns:
        processed_content: the rewritten file content
    """
    print(f"正在处理 {summary_path}...")

    try:
        with open(summary_path, 'r', encoding='utf-8') as file:
            content = file.read()
    except Exception as e:
        print(f"读取文件时出错: {e}")
        sys.exit(1)

    # Ensure base_url ends with '/' so urljoin keeps its last path segment.
    if not base_url.endswith('/'):
        base_url += '/'

    lines = content.split('\n')
    processed_lines = []

    # Markdown links: [text](target)
    link_pattern = r'\[([^\]]+)\]\(([^)]+)\)'

    for line in lines:
        matches = re.findall(link_pattern, line)
        processed_line = line

        # Rewrite each link found on this line.
        for text, link in matches:
            # Keep pure anchor links untouched.
            if link.startswith('#'):
                continue

            # Separate an optional #fragment so the ".md" strip still works
            # for links like "page.md#section".
            path_part, sep, fragment = link.partition('#')

            # Resolve relative paths against the base URL.
            if not path_part.startswith(('http://', 'https://')):
                if path_part.startswith('/'):
                    path_part = path_part[1:]
                full_url = urllib.parse.urljoin(base_url, path_part)
            else:
                full_url = path_part

            # Drop the .md suffix to get the published URL.
            if full_url.endswith('.md'):
                full_url = full_url[:-3]

            # Re-attach the fragment, if any.
            if sep:
                full_url += '#' + fragment

            # Replace the link in place.
            original_link = f"[{text}]({link})"
            new_link = f"[{text}]({full_url})"
            processed_line = processed_line.replace(original_link, new_link)

        processed_lines.append(processed_line)

    return '\n'.join(processed_lines)
|
||||
|
||||
|
||||
def save_to_markdown(content, output_path):
    """
    Write the processed content to a Markdown file.

    If *output_path* is an existing directory, the file is written inside it
    as ``gitbook-urls.md``; missing parent directories are created on demand.

    Args:
        content: processed text to write
        output_path: target file path or directory
    """
    # Resolve a directory argument to a default file name inside it.
    output_file = (
        os.path.join(output_path, "gitbook-urls.md")
        if os.path.isdir(output_path)
        else output_path
    )

    # Create the parent directory when it does not exist yet.
    output_dir = os.path.dirname(output_file)
    if output_dir and not os.path.exists(output_dir):
        try:
            os.makedirs(output_dir)
            print(f"已创建目录: {output_dir}")
        except Exception as e:
            print(f"创建目录时出错: {e}")
            sys.exit(1)

    try:
        with open(output_file, 'w', encoding='utf-8') as file:
            file.write(content)
        print(f"Markdown文件已生成: {output_file}")
    except Exception as e:
        print(f"写入文件时出错: {e}")
        sys.exit(1)
|
||||
|
||||
|
||||
def add_header(content):
    """
    Prepend the report title and a short explanation to *content*.

    Args:
        content: original content

    Returns:
        new_content: content with the header prepended
    """
    header_parts = [
        "# GitBook文档链接\n\n",
        "以下是从SUMMARY.md提取的文档结构和链接:\n\n",
    ]
    return "".join(header_parts) + content
|
||||
|
||||
|
||||
if __name__ == "__main__":
    print("=" * 60)
    print("改进的GitBook Summary链接提取器 (支持目录输出)")
    print("=" * 60)

    # SUMMARY.md path: argv[1] or prompt (default: ./SUMMARY.md).
    if len(sys.argv) > 1:
        summary_path = sys.argv[1]
    else:
        summary_path = input("请输入SUMMARY.md文件路径: ").strip()
        if not summary_path:
            summary_path = os.path.join(os.getcwd(), "SUMMARY.md")
            print(f"使用默认路径: {summary_path}")

    # Fail early if the file does not exist.
    if not os.path.isfile(summary_path):
        print(f"错误: 文件 '{summary_path}' 不存在")
        sys.exit(1)

    # Base URL of the published docs: argv[2] or prompt.
    if len(sys.argv) > 2:
        base_url = sys.argv[2]
    else:
        base_url = input("请输入文档基础URL: ").strip()
        if not base_url:
            base_url = "https://docs.example.com/"
            print(f"使用默认URL: {base_url}")

    # Output file path or directory: argv[3] or prompt.
    if len(sys.argv) > 3:
        output_path = sys.argv[3]
    else:
        default_output = os.path.join(os.path.dirname(summary_path), "gitbook-urls.md")
        output_path = input(f"请输入输出文件路径或目录 [默认: {default_output}]: ").strip()
        if not output_path:
            output_path = default_output

    # Rewrite the links.
    processed_content = process_summary_file(summary_path, base_url)

    # Prepend the report header.
    final_content = add_header(processed_content)

    # Write the result.
    save_to_markdown(final_content, output_path)

    print("\n处理完成!")
|
||||
367
scripts/extract-local-file-url.py
Normal file
367
scripts/extract-local-file-url.py
Normal file
@@ -0,0 +1,367 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
本地GitBook Markdown文件链接检查工具
|
||||
|
||||
此脚本会:
|
||||
1. 从SUMMARY.md提取所有文档链接
|
||||
2. 解析每个本地Markdown文件
|
||||
3. 提取并验证文件中的内部链接
|
||||
4. 生成链接检查报告
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import csv
|
||||
from datetime import datetime
|
||||
from urllib.parse import urlparse, urljoin
|
||||
|
||||
# 尝试导入依赖,如果不存在则自动安装
|
||||
try:
|
||||
from bs4 import BeautifulSoup
|
||||
import markdown
|
||||
except ImportError:
|
||||
print("正在安装必要依赖...")
|
||||
import subprocess
|
||||
subprocess.check_call([sys.executable, "-m", "pip", "install", "beautifulsoup4", "markdown"])
|
||||
from bs4 import BeautifulSoup
|
||||
import markdown
|
||||
|
||||
|
||||
class GitbookLocalChecker:
    """Link checker for local GitBook Markdown files."""

    def __init__(self, summary_path, base_dir=None, remove_md=True):
        """
        Set up the checker.

        Args:
            summary_path: path to SUMMARY.md
            base_dir: documentation root; defaults to the directory of SUMMARY.md
            remove_md: whether to strip the .md suffix
        """
        self.summary_path = os.path.abspath(summary_path)
        self.base_dir = base_dir if base_dir else os.path.dirname(self.summary_path)
        self.remove_md = remove_md

        self.all_links = []      # every link discovered
        self.all_md_files = []   # local Markdown files queued for scanning
        self.invalid_links = []  # links whose target is missing

        # Files already parsed, to avoid re-processing.
        self.processed_files = set()
|
||||
|
||||
def extract_summary_links(self):
|
||||
"""从SUMMARY.md提取所有Markdown文件链接"""
|
||||
print(f"正在从 {self.summary_path} 提取文档链接...")
|
||||
|
||||
with open(self.summary_path, 'r', encoding='utf-8') as file:
|
||||
content = file.read()
|
||||
|
||||
# 使用正则表达式提取链接
|
||||
link_pattern = r'\[([^\]]+)\]\(([^)]+)\)'
|
||||
matches = re.findall(link_pattern, content)
|
||||
|
||||
links = []
|
||||
for i, (text, link) in enumerate(matches, 1):
|
||||
# 排除锚点链接
|
||||
if not link.startswith('#') and link.endswith('.md'):
|
||||
# 计算本地文件路径
|
||||
local_path = os.path.normpath(os.path.join(self.base_dir, link))
|
||||
|
||||
links.append({
|
||||
'id': i,
|
||||
'text': text,
|
||||
'link': link,
|
||||
'local_path': local_path,
|
||||
'exists': os.path.exists(local_path),
|
||||
'type': 'summary_link',
|
||||
'source_file': 'SUMMARY.md'
|
||||
})
|
||||
|
||||
# 将文件添加到待处理列表
|
||||
if os.path.exists(local_path):
|
||||
self.all_md_files.append(local_path)
|
||||
|
||||
print(f"找到 {len(links)} 个文档链接,{len(self.all_md_files)} 个本地Markdown文件")
|
||||
self.all_links.extend(links)
|
||||
return links
|
||||
|
||||
    def process_md_file(self, file_path):
        """
        Extract and validate the internal links of one Markdown file.

        Args:
            file_path: path of the Markdown file

        Returns:
            links: link records found in the file (empty on error or if the
                file was already processed)
        """
        # Each file is processed at most once.
        if file_path in self.processed_files:
            return []

        self.processed_files.add(file_path)
        relative_path = os.path.relpath(file_path, self.base_dir)

        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()

            # Markdown links: [text](target)
            link_pattern = r'\[([^\]]+)\]\(([^)]+)\)'
            matches = re.findall(link_pattern, content)

            links = []
            for text, link in matches:
                # Only internal links are validated; external and anchor links skip.
                if link.startswith(('http://', 'https://', '#')):
                    continue

                # Resolve the target path.
                if link.startswith('/'):
                    # Relative to the documentation root.
                    target_path = os.path.normpath(os.path.join(self.base_dir, link.lstrip('/')))
                else:
                    # Relative to the current file's directory.
                    target_path = os.path.normpath(os.path.join(os.path.dirname(file_path), link))

                # Extension-less target: a directory means README.md,
                # otherwise assume a .md file reference.
                if not os.path.splitext(target_path)[1]:
                    if os.path.isdir(target_path):
                        target_path = os.path.join(target_path, 'README.md')
                    else:
                        target_path += '.md'

                # Does the resolved target exist on disk?
                exists = os.path.exists(target_path)

                link_info = {
                    'text': text,
                    'link': link,
                    'local_path': target_path,
                    'target_file': os.path.basename(target_path),
                    'exists': exists,
                    'type': 'internal_link',
                    'source_file': relative_path
                }

                links.append(link_info)

                if not exists:
                    # Broken link.
                    self.invalid_links.append(link_info)
                # Valid, unseen Markdown file: queue it for scanning.
                elif target_path.endswith('.md') and target_path not in self.processed_files:
                    self.all_md_files.append(target_path)

            return links

        except Exception as e:
            print(f"处理文件 {file_path} 时出错: {e}")
            return []
|
||||
|
||||
    def process_all_files(self):
        """Process SUMMARY.md and then every reachable Markdown file."""
        print("开始处理所有Markdown文件...")

        # Seed the worklist from SUMMARY.md.
        self.extract_summary_links()

        # Copy the list: process_md_file() appends newly discovered files to
        # self.all_md_files while this loop runs.
        files_to_process = list(self.all_md_files)
        processed_count = 0

        for file_path in files_to_process:
            if file_path not in self.processed_files:
                relative_path = os.path.relpath(file_path, self.base_dir)
                print(f"处理文件: {relative_path}")

                links = self.process_md_file(file_path)
                self.all_links.extend(links)

                processed_count += 1

                # Pull in any files discovered while processing this one;
                # extending the list being iterated makes later iterations
                # visit them.
                new_files = [f for f in self.all_md_files if f not in files_to_process and f not in self.processed_files]
                files_to_process.extend(new_files)

        print(f"已处理 {processed_count} 个Markdown文件")
        print(f"共找到 {len(self.all_links)} 个链接,其中 {len(self.invalid_links)} 个无效")
|
||||
|
||||
def generate_markdown_report(self, output_path):
    """Write a Markdown report of the link-check results to *output_path*.

    The report contains a summary section, invalid links grouped by the
    source file they appear in, and per-file link counts.  Reads
    ``self.processed_files``, ``self.all_links`` and ``self.invalid_links``;
    does not modify any state.

    NOTE(review): uses ``datetime.now()`` — requires
    ``from datetime import datetime`` at module top, which is not visible
    in this excerpt; confirm it is present in the full file.
    """
    print(f"正在生成报告: {output_path}")

    content = f"""# GitBook本地链接检查报告

## 摘要
- 检查时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
- 处理文件数: {len(self.processed_files)}
- 总链接数: {len(self.all_links)}
- 无效链接数: {len(self.invalid_links)}

## 无效链接列表
"""

    # Group invalid links by the file that contains them.
    grouped_links = {}
    for link in self.invalid_links:
        source = link['source_file']
        if source not in grouped_links:
            grouped_links[source] = []
        grouped_links[source].append(link)

    for source, links in sorted(grouped_links.items()):
        content += f"\n### 文件: {source}\n"
        for link in links:
            content += f"- [{link['text']}]({link['link']}) -> {link['local_path']} (无效)\n"

    # Per-file statistics over all links (valid and invalid).
    content += "\n## 文件链接统计\n"
    file_stats = {}
    for link in self.all_links:
        source = link['source_file']
        if source not in file_stats:
            file_stats[source] = {'total': 0, 'invalid': 0}
        file_stats[source]['total'] += 1
        if not link['exists']:
            file_stats[source]['invalid'] += 1

    for source, stats in sorted(file_stats.items()):
        content += f"- {source}: 共 {stats['total']} 个链接,{stats['invalid']} 个无效\n"

    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(content)

    print(f"报告已生成: {output_path}")
|
||||
|
||||
def generate_csv_report(self, output_path):
    """Write all collected links to *output_path* as a CSV file.

    One row per entry in ``self.all_links`` with the columns
    source_file, text, link, local_path, exists and type.

    NOTE(review): uses the ``csv`` module — ``import csv`` at module top is
    not visible in this excerpt; confirm it is present in the full file.
    """
    print(f"正在生成CSV报告: {output_path}")

    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['source_file', 'text', 'link', 'local_path', 'exists', 'type']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for link in self.all_links:
            # Copy only the reported fields; extra keys on the link dict
            # would make DictWriter raise.
            writer.writerow({
                'source_file': link['source_file'],
                'text': link['text'],
                'link': link['link'],
                'local_path': link['local_path'],
                'exists': link['exists'],
                'type': link['type'],
            })

    print(f"CSV报告已生成: {output_path}")
|
||||
|
||||
|
||||
def get_input_with_default(prompt, default=None):
    """Prompt the user for input, falling back to *default* on empty input.

    Shows the default inside square brackets when one is given; a
    whitespace-only answer counts as empty.  Uses ``is not None`` (rather
    than truthiness) so a falsy-but-meaningful default such as ``""`` or
    ``"0"`` is still offered.
    """
    if default is not None:
        user_input = input(f"{prompt} [{default}]: ")
        return user_input if user_input.strip() else default
    return input(f"{prompt}: ")
|
||||
|
||||
|
||||
def get_yes_no_input(prompt, default="y"):
    """Ask a yes/no question on stdin and return True/False.

    Accepts y/yes/是 and n/no/否 (case-insensitive).  An empty answer, or
    any unrecognised answer, returns the value implied by *default*.
    """
    valid_responses = {
        'y': True, 'yes': True, '是': True,
        'n': False, 'no': False, '否': False,
    }

    # Render the prompt so the default choice is capitalised.
    if default.lower() in ('y', 'yes', '是'):
        prompt = f"{prompt} [Y/n]: "
        default_value = True
    else:
        prompt = f"{prompt} [y/N]: "
        default_value = False

    user_input = input(prompt).lower()

    if not user_input:
        return default_value

    return valid_responses.get(user_input, default_value)
|
||||
|
||||
|
||||
def main():
    """Interactive entry point for the local GitBook link checker.

    Prompts for the SUMMARY.md path, documentation root and output
    directory, runs the checker, writes the Markdown and CSV reports and
    prints a short summary.  Exits with status 1 on any unexpected error.
    """
    print("=" * 60)
    print("本地GitBook Markdown文件链接检查工具")
    print("=" * 60)

    # Re-prompt until an existing SUMMARY.md is given.
    while True:
        summary_path = get_input_with_default(
            "请输入SUMMARY.md文件路径",
            os.path.join(os.getcwd(), "SUMMARY.md")
        )
        if os.path.isfile(summary_path):
            break
        print(f"错误: 文件 '{summary_path}' 不存在")

    # Documentation root (defaults to the directory containing SUMMARY.md).
    default_base_dir = os.path.dirname(os.path.abspath(summary_path))
    base_dir = get_input_with_default(
        "请输入文档根目录(包含所有Markdown文件的目录)",
        default_base_dir
    )

    # Output directory for the generated reports.
    output_dir = get_input_with_default(
        "请输入输出目录",
        os.path.dirname(summary_path) or os.getcwd()
    )
    os.makedirs(output_dir, exist_ok=True)

    report_path = os.path.join(output_dir, "gitbook-links-report.md")
    csv_path = os.path.join(output_dir, "gitbook-links-report.csv")

    remove_md = get_yes_no_input("是否移除链接中的.md后缀", "y")

    try:
        # NOTE(review): references GitbookLocalChecker, while the class
        # defined at the top of this file appears to be named LinkChecker —
        # confirm the intended class name.
        checker = GitbookLocalChecker(
            summary_path=summary_path,
            base_dir=base_dir,
            remove_md=remove_md
        )

        checker.process_all_files()

        checker.generate_markdown_report(report_path)
        checker.generate_csv_report(csv_path)

        print("\n检查完成!")
        print(f"Markdown报告: {report_path}")
        print(f"CSV报告: {csv_path}")

        print(f"\n摘要:")
        print(f"- 处理文件数: {len(checker.processed_files)}")
        print(f"- 总链接数: {len(checker.all_links)}")
        print(f"- 无效链接数: {len(checker.invalid_links)}")

        if checker.invalid_links:
            print("\n无效链接示例:")
            for i, link in enumerate(checker.invalid_links[:5], 1):
                print(f"{i}. 文件 '{link['source_file']}' 中 [{link['text']}]({link['link']}) -> {link['local_path']} (无效)")

            if len(checker.invalid_links) > 5:
                print(f"... 以及其他 {len(checker.invalid_links) - 5} 个无效链接")

    except Exception as e:
        print(f"执行过程中出错: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
import logging
|
||||
|
||||
@@ -18,7 +19,9 @@ logging.basicConfig(
|
||||
logger = logging.getLogger("md-to-mdx")
|
||||
|
||||
class MarkdownToMDXConverter:
|
||||
def __init__(self):
|
||||
def __init__(self, backup=True, in_place=False):
|
||||
self.backup = backup
|
||||
self.in_place = in_place
|
||||
self.conversion_count = 0
|
||||
self.error_count = 0
|
||||
self.base_output_dir = None
|
||||
@@ -31,90 +34,351 @@ class MarkdownToMDXConverter:
|
||||
logger.error(f"输入目录不存在: {input_dir}")
|
||||
return
|
||||
|
||||
if self.base_output_dir is None and output_dir:
|
||||
# 保存基础输出目录,用于构建子目录输出路径
|
||||
if not self.in_place and self.base_output_dir is None and output_dir:
|
||||
self.base_output_dir = Path(output_dir)
|
||||
self.base_input_dir = input_path
|
||||
self.base_output_dir.mkdir(parents=True, exist_ok=True)
|
||||
logger.info(f"创建基础输出目录: {self.base_output_dir}")
|
||||
|
||||
for file in input_path.glob("*.md"):
|
||||
if self.base_output_dir:
|
||||
rel_path = file.parent.relative_to(self.base_input_dir) if file.parent != self.base_input_dir else Path('')
|
||||
target_dir = self.base_output_dir / rel_path
|
||||
target_dir.mkdir(parents=True, exist_ok=True)
|
||||
self._process_file(file, target_dir)
|
||||
# 处理当前目录中的所有.md和.mdx文件
|
||||
for file in list(input_path.glob("*.md")) + list(input_path.glob("*.mdx")):
|
||||
if self.in_place:
|
||||
# 在原位置处理
|
||||
self._process_file(file, file.parent, delete_original=True)
|
||||
else:
|
||||
self._process_file(file, file.parent)
|
||||
# 计算相对于基础输入目录的路径
|
||||
if self.base_output_dir:
|
||||
rel_path = file.parent.relative_to(self.base_input_dir) if file.parent != self.base_input_dir else Path('')
|
||||
target_dir = self.base_output_dir / rel_path
|
||||
target_dir.mkdir(parents=True, exist_ok=True)
|
||||
self._process_file(file, target_dir)
|
||||
else:
|
||||
# 如果没有基础输出目录,则就地处理
|
||||
self._process_file(file, file.parent)
|
||||
|
||||
# 如果需要递归处理子目录
|
||||
if recursive:
|
||||
for subdir in [d for d in input_path.iterdir() if d.is_dir()]:
|
||||
# 跳过output目录,避免重复处理
|
||||
if subdir.name == "output" or subdir.name.startswith('.'):
|
||||
continue
|
||||
|
||||
self.process_directory(subdir, output_dir, recursive)
|
||||
|
||||
def _process_file(self, file_path, output_dir):
|
||||
def _process_file(self, file_path, output_dir, delete_original=False):
|
||||
"""处理单个Markdown文件"""
|
||||
try:
|
||||
logger.info(f"处理文件: {file_path}")
|
||||
|
||||
# 备份原始文件(如果需要)
|
||||
if self.backup:
|
||||
backup_file = str(file_path) + ".bak"
|
||||
if not os.path.exists(backup_file):
|
||||
shutil.copy2(file_path, backup_file)
|
||||
logger.info(f"已创建备份: {backup_file}")
|
||||
|
||||
# 读取文件内容
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
content = f.read()
|
||||
|
||||
content = self._fix_broken_text(content)
|
||||
content = self._convert_images(content)
|
||||
content = self._convert_hints(content)
|
||||
# 执行转换
|
||||
converted_content = self.convert_content(content)
|
||||
|
||||
# 确定输出文件路径
|
||||
output_file = output_dir / (file_path.stem + ".mdx")
|
||||
|
||||
# 写入转换后的内容
|
||||
with open(output_file, 'w', encoding='utf-8') as f:
|
||||
f.write(converted_content)
|
||||
|
||||
logger.info(f"转换完成: {output_file}")
|
||||
self.conversion_count += 1
|
||||
|
||||
# 如果需要,删除原始文件
|
||||
if delete_original:
|
||||
try:
|
||||
os.remove(file_path)
|
||||
logger.info(f"已删除源文件: {file_path}")
|
||||
except Exception as e:
|
||||
logger.error(f"删除源文件 {file_path} 失败: {str(e)}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"处理文件 {file_path} 时出错: {str(e)}")
|
||||
self.error_count += 1
|
||||
|
||||
def _fix_broken_text(self, content):
|
||||
"""修复文本中的割裂问题,特别是在代码块周围"""
|
||||
broken_code_pattern = re.compile(r'```([a-zA-Z]*)\r?\n(.*?)\r?\n```([a-zA-Z]*)', re.DOTALL)
|
||||
content = broken_code_pattern.sub(r'```\1\n\2\n```', content)
|
||||
return content
|
||||
|
||||
def _convert_images(self, content):
|
||||
"""转换HTML图片格式为Markdown或MDX格式"""
|
||||
|
||||
# 转换没有标题的 <figure><img> 结构
|
||||
img_pattern_no_caption = re.compile(r'<figure>\s*<img src="([^"]+)" alt="([^"]*)">\s*<figcaption></figcaption>\s*</figure>', re.DOTALL)
|
||||
content = img_pattern_no_caption.sub(r'', content)
|
||||
|
||||
# 转换带标题的 <figure><img> 结构
|
||||
img_pattern_with_caption = re.compile(r'<figure>\s*<img src="([^"]+)" alt="([^"]*)">\s*<figcaption><p>(.*?)</p></figcaption>\s*</figure>', re.DOTALL)
|
||||
def img_replacer(match):
|
||||
img_src = match.group(1)
|
||||
alt_text = match.group(3).strip()
|
||||
return f''
|
||||
content = img_pattern_with_caption.sub(img_replacer, content)
|
||||
|
||||
return content
|
||||
|
||||
def _convert_hints(self, content):
|
||||
"""转换 hint 提示框"""
|
||||
hint_pattern = re.compile(r'{%\s*hint\s*style="info"\s*%}\s*{%\s*endhint\s*%}', re.DOTALL)
|
||||
content = hint_pattern.sub(r'<Info>\n</Info>', content)
|
||||
return content
|
||||
|
||||
def convert_content(self, content):
    """Convert GitBook-flavoured Markdown *content* to Mintlify MDX.

    Applies, in order: first H1 -> frontmatter, hint blocks, content-ref
    cards, side-by-side image pairs, <Frame>/<figure>/<img> normalisation,
    tabs, and Markdown tables -> HTML tables.

    NOTE(review): this body was reconstructed from a whitespace-stripped
    scrape; indentation inside the multi-line string templates was lost
    and is reproduced flat, and the <Frame> replacement literal (which
    appeared empty) is reconstructed as a Markdown image — confirm both
    against the original script.
    """
    # 1. Turn the first top-level heading into MDX frontmatter.
    h1_pattern = re.compile(r'^#\s+(.+?)$', re.MULTILINE)
    match = h1_pattern.search(content)
    if match:
        title = match.group(1).strip()
        # Callable replacement so backslashes in the title are not
        # interpreted as regex escape sequences by re.sub.
        content = h1_pattern.sub(lambda _m: f'---\ntitle: {title}\n---\n', content, count=1)

    # 2. {% hint style="..." %} -> <Info>/<Warning>/... components.
    hint_pattern = re.compile(
        r'{%\s*hint\s+style="(\w+)"\s*%}(.*?){%\s*endhint\s*%}',
        re.DOTALL
    )

    def hint_replacer(match):
        style = match.group(1)
        text = match.group(2).strip()
        component_name = style.capitalize() if style != "info" else "Info"
        return f'<{component_name}>\n{text}\n</{component_name}>'

    content = hint_pattern.sub(hint_replacer, content)

    # 3. {% content-ref %} card links -> <Card>.
    card_pattern = re.compile(
        r'{%\s*content-ref\s+url="([^"]+)"\s*%}\s*\[([^\]]+)\]\(([^)]+)\)\s*{%\s*endcontent-ref\s*%}',
        re.DOTALL
    )

    def card_replacer(match):
        url = match.group(1)
        title = match.group(2)
        return f'<Card title="{title}" icon="link" href="{url}">\n {title}\n</Card>'

    content = card_pattern.sub(card_replacer, content)

    # 4. Two consecutive Markdown images -> side-by-side layout.
    img_pattern = re.compile(r'!\[(.*?)\]\((.*?)\)\s*!\[(.*?)\]\((.*?)\)', re.DOTALL)

    def img_side_replacer(match):
        alt1 = match.group(1) or "Image 1"
        src1 = match.group(2)
        alt2 = match.group(3) or "Image 2"
        src2 = match.group(4)

        return f'''<div class="image-side-by-side">
<figure>
<img src="{src1}" alt="{alt1}" />
</figure>
<figure>
<img src="{src2}" alt="{alt2}" />
</figure>
</div>'''

    content = img_pattern.sub(img_side_replacer, content)

    # 5. <Frame>-wrapped images -> Markdown image (reconstructed, see NOTE).
    frame_pattern = re.compile(r'<Frame>\s*<img\s+src="([^"]+)"\s+alt="([^"]+)"\s*/>\s*</Frame>', re.DOTALL)

    def frame_replacer(match):
        src = match.group(1)
        alt = match.group(2)
        return f'![{alt}]({src})'

    content = frame_pattern.sub(frame_replacer, content)

    # 5.1 <figure><img width ...><figcaption> -> centred <img>.
    figure_img_width_caption_pattern = re.compile(r'<figure>\s*<img\s+src="([^"]+)"\s+alt="([^"]*)"\s+width="(\d+)"\s*/?>\s*<figcaption>(?:<p>)?(.*?)(?:</p>)?</figcaption>\s*</figure>', re.DOTALL)

    def figure_img_width_caption_replacer(match):
        src = match.group(1)
        alt = match.group(2) or ""
        width = match.group(3)
        caption = match.group(4).strip()

        # The caption, when present, wins over the original alt text.
        if caption:
            alt = caption

        return f'''<img
src="{src}"
width="{width}"
className="mx-auto"
alt="{alt}"
/>'''

    content = figure_img_width_caption_pattern.sub(figure_img_width_caption_replacer, content)

    # 5.2 <figure><img width ...> without a figcaption.
    figure_img_width_pattern = re.compile(r'<figure>\s*<img\s+src="([^"]+)"\s+alt="([^"]*)"\s+width="(\d+)"\s*/?>\s*</figure>', re.DOTALL)

    def figure_img_width_replacer(match):
        src = match.group(1)
        alt = match.group(2) or ""
        width = match.group(3)

        return f'''<img
src="{src}"
width="{width}"
className="mx-auto"
alt="{alt}"
/>'''

    content = figure_img_width_pattern.sub(figure_img_width_replacer, content)

    # 5.3 <figure><img> with a figcaption but no width.
    figure_img_caption_pattern = re.compile(r'<figure>\s*<img\s+src="([^"]+)"\s+alt="([^"]*)"\s*/?>\s*<figcaption>(?:<p>)?(.*?)(?:</p>)?</figcaption>\s*</figure>', re.DOTALL)

    def figure_img_caption_replacer(match):
        src = match.group(1)
        alt = match.group(2) or ""
        caption = match.group(3).strip()

        if caption:
            alt = caption

        return f'''<img
src="{src}"
className="mx-auto"
alt="{alt}"
/>'''

    content = figure_img_caption_pattern.sub(figure_img_caption_replacer, content)

    # 5.4 Bare <figure><img> with neither figcaption nor width.
    figure_img_no_caption_pattern = re.compile(r'<figure>\s*<img\s+src="([^"]+)"\s+alt="([^"]*)"\s*/?>\s*</figure>', re.DOTALL)

    def figure_img_no_caption_replacer(match):
        src = match.group(1)
        alt = match.group(2) or ""

        return f'''<img
src="{src}"
className="mx-auto"
alt="{alt}"
/>'''

    content = figure_img_no_caption_pattern.sub(figure_img_no_caption_replacer, content)

    # 6. {% tabs %} blocks -> <Tabs>/<Tab> components.
    tabs_pattern = re.compile(
        r'{%\s*tabs\s*%}(.*?){%\s*endtabs\s*%}',
        re.DOTALL
    )

    def tabs_replacer(match):
        tabs_content = match.group(1)
        tab_pattern = re.compile(
            r'{%\s*tab\s+title="([^"]+)"\s*%}(.*?){%\s*endtab\s*%}',
            re.DOTALL
        )

        tabs_start = "<Tabs>"
        tabs_items = []

        for tab_match in tab_pattern.finditer(tabs_content):
            title = tab_match.group(1)
            tab_body = tab_match.group(2).strip()
            tabs_items.append(f' <Tab title="{title}">\n {tab_body}\n </Tab>')

        tabs_end = "</Tabs>"
        return tabs_start + "\n" + "\n".join(tabs_items) + "\n" + tabs_end

    content = tabs_pattern.sub(tabs_replacer, content)

    # 7. Standalone <img ... width="..."> tags.
    img_size_pattern = re.compile(r'<img\s+src="([^"]+)"\s+width="(\d+)"(?:\s+alt="([^"]*)")?\s*/>', re.DOTALL)

    def img_size_replacer(match):
        src = match.group(1)
        width = match.group(2)
        alt = match.group(3) if match.group(3) else ""

        return f'''<img
src="{src}"
width="{width}"
className="mx-auto"
alt="{alt}"
/>'''

    content = img_size_pattern.sub(img_size_replacer, content)

    # 7.1 Any remaining standalone <img> tags.
    # NOTE(review): this loose pattern also re-matches the multi-line <img>
    # tags emitted by steps 5.x/7 above and drops their width/alt attributes
    # (only `src` is captured before `[^>]*>` swallows the rest) — confirm
    # whether that self-rewriting is intended.
    standalone_img_pattern = re.compile(r'<img\s+src="([^"]+)"(?:\s+alt="([^"]*)")?[^>]*>', re.DOTALL)

    def standalone_img_replacer(match):
        src = match.group(1)
        alt = match.group(2) if match.group(2) else ""

        return f'''<img
src="{src}"
className="mx-auto"
alt="{alt}"
/>'''

    content = standalone_img_pattern.sub(standalone_img_replacer, content)

    # 8. Markdown tables -> HTML tables.
    table_pattern = re.compile(r'(\|.*\|\n\|[-:\s|]*\|\n(?:\|.*\|\n)+)', re.MULTILINE)

    def table_replacer(match):
        md_table = match.group(1)
        lines = md_table.strip().split('\n')

        # Header row, then skip the |---| separator row.
        header_row = lines[0]
        header_cells = [cell.strip() for cell in header_row.split('|')[1:-1]]

        body_rows = lines[2:]
        body_cells_rows = []
        for row in body_rows:
            cells = [cell.strip() for cell in row.split('|')[1:-1]]
            body_cells_rows.append(cells)

        mdx_table = "<table>\n <thead>\n <tr>\n"

        for cell in header_cells:
            mdx_table += f" <th>{cell}</th>\n"

        mdx_table += " </tr>\n </thead>\n <tbody>\n"

        # Compiled once per table instead of once per cell (hoisted —
        # loop-invariant, identical behaviour).
        link_pattern = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
        br_pattern = re.compile(r'<br\s*/?>')

        for row_cells in body_cells_rows:
            mdx_table += " <tr>\n"
            for cell in row_cells:
                # Markdown links inside cells -> <a> tags.
                cell = link_pattern.sub(r'<a href="\2">\1</a>', cell)

                # Split cells containing <br> (any of <br>, <br/>, <br />)
                # into <p> paragraphs.
                if '<p>' in cell or br_pattern.search(cell):
                    if '<p>' in cell and br_pattern.search(cell):
                        cell = br_pattern.sub(r'</p>\n <p>', cell)
                        # NOTE(review): this cleanup appears unreachable —
                        # the preceding sub already removed every <br>.
                        cell = re.sub(r'<br\s*/?>(\s*</p>)', r'\1', cell)
                    elif br_pattern.search(cell) and '<p>' not in cell:
                        paragraphs = br_pattern.split(cell)
                        cell = '<p>' + '</p>\n <p>'.join([p.strip() for p in paragraphs if p.strip()]) + '</p>'

                    mdx_table += f" <td>\n {cell}\n </td>\n"
                else:
                    # Plain text cell.
                    mdx_table += f" <td>{cell}</td>\n"
            mdx_table += " </tr>\n"

        mdx_table += " </tbody>\n</table>"

        return mdx_table

    content = table_pattern.sub(table_replacer, content)

    return content
|
||||
|
||||
|
||||
def get_statistics(self):
|
||||
"""返回处理统计信息"""
|
||||
return {
|
||||
@@ -127,6 +391,7 @@ def main():
|
||||
print("Gitbook Markdown 转 Mintlify MDX 转换工具")
|
||||
print("=" * 60)
|
||||
|
||||
# 通过交互方式获取输入路径
|
||||
input_path_str = input("请输入源文件或目录路径: ")
|
||||
input_path = Path(input_path_str)
|
||||
|
||||
@@ -134,34 +399,54 @@ def main():
|
||||
print(f"错误: 路径 '{input_path_str}' 不存在!")
|
||||
return
|
||||
|
||||
# 询问是否递归处理子目录
|
||||
recursive = False
|
||||
if input_path.is_dir():
|
||||
recursive_input = input("是否递归处理所有子目录? (y/n): ").lower()
|
||||
recursive = recursive_input in ('y', 'yes')
|
||||
|
||||
if input_path.is_file():
|
||||
output_dir = input_path.parent / "output"
|
||||
else:
|
||||
output_dir = input_path / "output"
|
||||
# 询问是否创建备份
|
||||
backup_input = input("是否创建备份文件? (y/n, 默认:y): ").lower()
|
||||
create_backup = backup_input in ('', 'y', 'yes')
|
||||
|
||||
converter = MarkdownToMDXConverter()
|
||||
# 询问是否原地转换并删除源文件
|
||||
in_place_input = input("是否在原地转换并删除源文件? (y/n, 默认:n): ").lower()
|
||||
in_place = in_place_input in ('y', 'yes')
|
||||
|
||||
if input_path.is_file() and input_path.suffix.lower() == '.md':
|
||||
# 确定输出目录
|
||||
output_dir = None
|
||||
if not in_place:
|
||||
if input_path.is_file():
|
||||
output_dir = input_path.parent / "output"
|
||||
else:
|
||||
output_dir = input_path / "output"
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
print(f"输出目录已创建: {output_dir}")
|
||||
converter._process_file(input_path, output_dir)
|
||||
|
||||
# 创建转换器并处理文件
|
||||
converter = MarkdownToMDXConverter(backup=create_backup, in_place=in_place)
|
||||
|
||||
if input_path.is_file() and input_path.suffix.lower() == '.md':
|
||||
# 处理单个文件
|
||||
if in_place:
|
||||
converter._process_file(input_path, input_path.parent, delete_original=True)
|
||||
else:
|
||||
converter._process_file(input_path, output_dir)
|
||||
elif input_path.is_dir():
|
||||
# 处理目录
|
||||
converter.process_directory(input_path, output_dir, recursive)
|
||||
else:
|
||||
logger.error(f"无效的输入路径: {input_path_str}")
|
||||
print(f"错误: '{input_path_str}' 不是有效的Markdown文件或目录!")
|
||||
return
|
||||
|
||||
# 打印统计信息
|
||||
stats = converter.get_statistics()
|
||||
print("=" * 60)
|
||||
print(f"转换完成! 成功转换: {stats['conversion_count']}个文件, 错误: {stats['error_count']}个文件")
|
||||
print(f"转换结果已保存至: {output_dir}")
|
||||
if not in_place and output_dir:
|
||||
print(f"转换结果已保存至: {output_dir}")
|
||||
print("=" * 60)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
Reference in New Issue
Block a user