Feat: update jp docs

This commit is contained in:
AllenWriter
2025-03-21 20:15:22 +08:00
parent c4225ab529
commit 5e6924c7ec
427 changed files with 35687 additions and 5735 deletions

757
scripts/auto-url-check.py Normal file
View File

@@ -0,0 +1,757 @@
#!/usr/bin/env python3
"""
Multi-threaded GitBook link checker.

Uses a pool of worker threads to verify online links in parallel, which
greatly speeds up checking. Produces two report files:
1. a full report listing every link
2. a report listing only the broken links
"""
import os
import re
import sys
import time
import threading
import queue
from concurrent.futures import ThreadPoolExecutor
from collections import defaultdict
from urllib.parse import urlparse

# Auto-install `requests` on first run if it is missing.
# NOTE(review): running pip at import time is a surprising side effect;
# consider declaring the dependency in requirements instead.
try:
    import requests
    from requests.exceptions import RequestException
except ImportError:
    print("正在安装requests库...")
    import subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "requests"])
    import requests
    from requests.exceptions import RequestException
class LinkChecker:
    """Validates local and online links reachable from a GitBook SUMMARY.md."""

    def __init__(self, summary_path, base_dir=None, verify_online=True, max_threads=10):
        """
        Initialize the link checker.

        Args:
            summary_path: path to the SUMMARY.md file.
            base_dir: documentation root; defaults to the SUMMARY.md directory.
            verify_online: whether to verify online (http/https) links.
            max_threads: maximum number of checker threads.
        """
        self.summary_path = os.path.abspath(summary_path)
        self.base_dir = base_dir or os.path.dirname(self.summary_path)
        self.verify_online = verify_online
        self.max_threads = max_threads
        self.summary_links = []  # top-level links from SUMMARY.md (tree roots)
        self.md_links = defaultdict(list)  # links referenced by each document
        self.processed_files = set()  # files already parsed
        self.summary_content = ""  # raw contents of SUMMARY.md
        self.invalid_links = []  # every link that failed validation
        # Image file extensions; image links are skipped entirely.
        self.image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.svg', '.bmp', '.tiff', '.webp')
        # Cache of online-link results so each URL is checked at most once.
        self.online_link_cache = {}
        self.online_link_cache_lock = threading.Lock()  # guards the cache
        # Queue of online links awaiting verification.
        self.online_links_queue = queue.Queue()
        # Progress counters, guarded by progress_lock.
        self.total_online_links = 0
        self.checked_online_links = 0
        self.progress_lock = threading.Lock()

    def is_image_link(self, link):
        """
        Return True if *link* points at an image file.

        Args:
            link: link path.

        Returns:
            is_image: whether the link is an image link.
        """
        return link.lower().endswith(self.image_extensions)

    def check_online_link(self, url):
        """
        Check whether an online link is reachable (status code < 400).

        Tries a HEAD request first (faster); falls back to GET when HEAD
        reports an error status. Results are cached per URL.

        Args:
            url: online link URL.

        Returns:
            is_valid: whether the link is valid.
        """
        # Return the cached result if this URL was already checked.
        with self.online_link_cache_lock:
            if url in self.online_link_cache:
                return self.online_link_cache[url]
        if not self.verify_online:
            # Online verification disabled: treat the link as invalid.
            with self.online_link_cache_lock:
                self.online_link_cache[url] = False
            return False
        try:
            # Try HEAD first -- cheaper than GET.
            response = requests.head(
                url,
                timeout=5,
                allow_redirects=True,
                headers={'User-Agent': 'Mozilla/5.0 GitBook-Link-Checker/1.0'}
            )
            if response.status_code < 400:
                # Status < 400 counts as a valid link.
                with self.online_link_cache_lock:
                    self.online_link_cache[url] = True
                return True
            # HEAD failed; some servers reject HEAD, so retry with GET.
            response = requests.get(
                url,
                timeout=5,
                allow_redirects=True,
                headers={'User-Agent': 'Mozilla/5.0 GitBook-Link-Checker/1.0'}
            )
            result = response.status_code < 400
            with self.online_link_cache_lock:
                self.online_link_cache[url] = result
            return result
        except RequestException:
            # Network/protocol error: the link is considered invalid.
            with self.online_link_cache_lock:
                self.online_link_cache[url] = False
            return False

    def resolve_path(self, link, current_dir):
        """
        Resolve a link to a concrete path and classify it.

        Args:
            link: link path as written in the document.
            current_dir: directory of the file containing the link.

        Returns:
            resolved_path: resolved path (or the URL itself for external links).
            is_external: whether the link is external.
            is_valid: True/False, or None for online links whose check is
                still pending.
        """
        if not link:
            return None, False, False
        # Strip an anchor fragment, if any.
        if '#' in link:
            link_part = link.split('#')[0]
            if not link_part:  # anchor-only link, no path part
                return None, False, True  # assume in-page anchors are valid
            link = link_part
        # Image links are skipped and assumed valid.
        if self.is_image_link(link):
            return None, False, True
        # External links.
        if link.startswith(('http://', 'https://', 'mailto:', 'tel:')):
            if link.startswith(('http://', 'https://')) and self.verify_online:
                # Enqueue for the worker threads; validity is unknown for now.
                self.online_links_queue.put(link)
                with self.progress_lock:
                    self.total_online_links += 1
                return link, True, None
            elif link.startswith(('http://', 'https://')) and not self.verify_online:
                # Verification disabled: flag online links as errors.
                return link, True, False
            else:
                # mailto:/tel: links are assumed valid.
                return link, True, True
        # Absolute path (relative to the documentation root).
        if link.startswith('/'):
            resolved_path = os.path.normpath(os.path.join(self.base_dir, link.lstrip('/')))
        # Relative path (relative to the current file's directory).
        else:
            resolved_path = os.path.normpath(os.path.join(current_dir, link))
        # Directory links resolve to README.md / index.md when present.
        if os.path.isdir(resolved_path):
            readme_path = os.path.join(resolved_path, 'README.md')
            if os.path.exists(readme_path):
                return readme_path, False, True
            index_path = os.path.join(resolved_path, 'index.md')
            if os.path.exists(index_path):
                return index_path, False, True
            # No README.md/index.md: keep the directory path as-is.
            return resolved_path, False, os.path.exists(resolved_path)
        # Extension-less file references: try appending ".md".
        if not os.path.exists(resolved_path) and '.' not in os.path.basename(resolved_path):
            md_path = f"{resolved_path}.md"
            if os.path.exists(md_path):
                return md_path, False, True
        return resolved_path, False, os.path.exists(resolved_path)

    def online_link_worker(self):
        """Worker thread body: drain the queue and check each online link."""
        while True:
            try:
                # Non-blocking get: the queue is fully populated before
                # workers start, so Empty means there is no more work.
                url = self.online_links_queue.get(block=False)
                is_valid = self.check_online_link(url)
                # Update progress under the lock; print outside it.
                with self.progress_lock:
                    self.checked_online_links += 1
                    checked = self.checked_online_links
                    total = self.total_online_links
                # NOTE(review): both branches of the status marker render ''
                # here -- the original symbols (likely a check/cross mark)
                # appear to have been stripped; restore them upstream.
                print(f"在线链接检查进度: [{checked}/{total}] - {url} - {'' if is_valid else ''}")
                self.online_links_queue.task_done()
            except queue.Empty:
                # Queue drained: exit the thread.
                break

    def extract_sections_from_summary(self):
        """
        Extract all section headings from SUMMARY.md.

        Also loads SUMMARY.md into self.summary_content as a side effect.

        Returns:
            sections: list of section titles.
        """
        print(f"从 {self.summary_path} 提取章节信息...")
        try:
            with open(self.summary_path, 'r', encoding='utf-8') as file:
                self.summary_content = file.read()
        except Exception as e:
            print(f"读取文件时出错: {e}")
            sys.exit(1)
        # Collect every heading line; the optional trailing <a ...> tag
        # (GitBook anchors) is excluded from the captured title.
        sections = []
        section_pattern = r'^#+\s+(.*?)(?:\s+<a.*?>)?$'
        for line in self.summary_content.split('\n'):
            match = re.match(section_pattern, line)
            if match:
                section_title = match.group(1).strip()
                sections.append(section_title)
        return sections

    def extract_links_from_summary(self):
        """
        Extract all links from SUMMARY.md together with their hierarchy.

        Side effects: re-reads SUMMARY.md, rebuilds self.summary_links and
        appends any invalid links to self.invalid_links.

        Returns:
            links: flat list of link-info dicts (each carries its level).
        """
        print(f"从 {self.summary_path} 提取链接...")
        # Track the section currently being scanned.
        current_section = ""
        sections = self.extract_sections_from_summary()
        # Process SUMMARY.md line by line.
        links = []
        for line in self.summary_content.split('\n'):
            # Section heading line?
            section_match = re.match(r'^#+\s+(.*?)(?:\s+<a.*?>)?$', line)
            if section_match:
                current_section = section_match.group(1).strip()
                continue
            # Determine indentation level of the bullet.
            indent_match = re.match(r'^(\s*)\*', line)
            if not indent_match:
                continue
            indent = indent_match.group(1)
            level = len(indent) // 2  # assumes 2-space indents per level
            # Extract the markdown link.
            link_match = re.search(r'\[([^\]]+)\]\(([^)]+)\)', line)
            if not link_match:
                continue
            text, link = link_match.groups()
            # Skip anchor-only links.
            if link.startswith('#'):
                continue
            # Resolve to an actual file path / URL.
            file_path, is_external, is_valid = self.resolve_path(link, self.base_dir)
            link_info = {
                'text': text,
                'link': link,
                'file_path': file_path,
                'exists': is_valid,
                'level': level,
                'section': current_section,
                'is_external': is_external,
                'children': [],  # populated while building the tree below
                'source_file': 'SUMMARY.md'
            }
            links.append(link_info)
            # Record invalid links (is_valid may be None: online check pending).
            if is_valid is False:
                self.invalid_links.append(link_info)
        # Build the parent/child hierarchy from indentation levels.
        root_links = []
        level_stack = [None]  # last link seen at each level
        for link in links:
            level = link['level']
            # Shrink the stack to the current level.
            while len(level_stack) > level + 1:
                level_stack.pop()
            # Grow the stack to the current level.
            while len(level_stack) < level + 1:
                level_stack.append(None)
            if level == 0:
                # Top-level link.
                root_links.append(link)
            else:
                # Attach to the most recent link one level up, if any.
                parent = level_stack[level - 1]
                if parent:
                    parent['children'].append(link)
            # Remember the latest link at this level.
            level_stack[level] = link
        self.summary_links = root_links
        return links

    def extract_links_from_markdown(self, file_path):
        """
        Extract links from a single Markdown file.

        Args:
            file_path: path to the Markdown file.

        Returns:
            links: list of link-info dicts found in the file (empty when the
            file was already processed, is missing, or is not .md).
        """
        if not file_path or file_path in self.processed_files:
            return []
        if not os.path.exists(file_path) or not file_path.endswith('.md'):
            return []
        self.processed_files.add(file_path)
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
        except Exception as e:
            print(f"读取文件 {file_path} 时出错: {e}")
            return []
        # Extract markdown links.
        link_pattern = r'\[([^\]]+)\]\(([^)]+)\)'
        matches = re.findall(link_pattern, content)
        links = []
        current_dir = os.path.dirname(file_path)
        relative_source_path = os.path.relpath(file_path, self.base_dir)
        for text, link in matches:
            # Skip image links.
            if self.is_image_link(link):
                continue
            resolved_path, is_external, is_valid = self.resolve_path(link, current_dir)
            link_info = {
                'text': text,
                'link': link,
                'file_path': resolved_path,
                'exists': is_valid,
                'is_external': is_external,
                'source_file': relative_source_path
            }
            links.append(link_info)
            # Index by source file. NOTE(review): md_links is a defaultdict,
            # so this membership check is redundant but harmless.
            if file_path not in self.md_links:
                self.md_links[file_path] = []
            self.md_links[file_path].append(link_info)
            # is_valid may be None for pending online checks.
            if is_valid is False:
                self.invalid_links.append(link_info)
        return links

    def check_links(self):
        """
        Recursively check all links reachable from SUMMARY.md, then run the
        online checks (if enabled) and fold the results back in.
        """
        # Parse SUMMARY.md first.
        self.extract_links_from_summary()

        # Walk the link tree, parsing every referenced local .md file.
        def process_link(link):
            if not link.get('is_external') and link.get('exists') and link.get('file_path') and link.get('file_path').endswith('.md'):
                try:
                    relative_path = os.path.relpath(link['file_path'], self.base_dir)
                    print(f"检查文件: {relative_path}")
                    self.extract_links_from_markdown(link['file_path'])
                except Exception as e:
                    print(f"处理文件 {link.get('file_path')} 时出错: {e}")
            # Recurse into children.
            for child in link.get('children', []):
                process_link(child)

        for link in self.summary_links:
            process_link(link)
        # Run the queued online checks on worker threads.
        if self.verify_online and self.total_online_links > 0:
            self.check_online_links_with_threads()
        # Propagate online results into the link records.
        self.update_link_statuses()

    def check_online_links_with_threads(self):
        """Verify the queued online links using a thread pool."""
        print(f"\n开始使用多线程检查在线链接,共有 {self.total_online_links} 个链接...")
        num_threads = min(self.max_threads, self.total_online_links)
        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            # NOTE(review): `futures` is never awaited/inspected; completion
            # is tracked via queue.join() instead.
            futures = [executor.submit(self.online_link_worker) for _ in range(num_threads)]
            # Block until every queued URL has been task_done()'d.
            self.online_links_queue.join()
        print(f"所有在线链接检查完成,共 {self.total_online_links} 个")

    def update_link_statuses(self):
        """Fold cached online-check results back into every link record."""
        def update_link(link):
            if link.get('is_external') and link.get('file_path') and link.get('file_path').startswith(('http://', 'https://')):
                with self.online_link_cache_lock:
                    is_valid = self.online_link_cache.get(link['file_path'], False)
                link['exists'] = is_valid
                # Record newly-discovered invalid links exactly once.
                if not is_valid and link not in self.invalid_links:
                    self.invalid_links.append(link)
            # Recurse into children.
            for child in link.get('children', []):
                update_link(child)

        # Update the SUMMARY.md link tree.
        for link in self.summary_links:
            update_link(link)
        # Update the per-document link index.
        for file_path, links in self.md_links.items():
            for link in links:
                if link.get('is_external') and link.get('file_path') and link.get('file_path').startswith(('http://', 'https://')):
                    with self.online_link_cache_lock:
                        is_valid = self.online_link_cache.get(link['file_path'], False)
                    link['exists'] = is_valid
                    if not is_valid and link not in self.invalid_links:
                        self.invalid_links.append(link)

    def generate_reports(self, output_path):
        """
        Generate both reports: the full report and the errors-only report.

        Args:
            output_path: path of the full report file.
        """
        self.generate_full_report(output_path)
        # Derive the error-report path. NOTE(review): str.replace substitutes
        # the FIRST '.md' anywhere in the path, not only the suffix; the
        # fallback below only covers the no-op case.
        error_report_path = output_path.replace('.md', '-error.md')
        if output_path == error_report_path:
            error_report_path = os.path.splitext(output_path)[0] + '-error.md'
        self.generate_error_report(error_report_path)

    def generate_full_report(self, output_path):
        """
        Generate the full report listing every link.

        Args:
            output_path: output file path.
        """
        content = "# GitBook链接检查报告完整版\n\n"
        # Explain the report format.
        content += "本报告显示了GitBook文档中的所有链接及其引用的文档。每行的格式为\n"
        content += "* [文档标题](文档链接) | [引用的文档1](链接1) | [引用的文档2](链接2) | ...\n\n"
        # Sections already emitted as headings.
        processed_sections = set()

        def generate_link_report(link, indent=""):
            nonlocal content
            # Emit a heading when entering a new section.
            if 'section' in link and link['section'] and link['section'] not in processed_sections:
                content += f"\n## {link['section']}\n\n"
                processed_sections.add(link['section'])
            file_path = link.get('file_path')
            # NOTE(review): both status branches render '' -- the original
            # pass/fail symbols appear to have been stripped from this copy.
            status = "" if link.get('exists', False) else ""
            # Main entry line.
            content += f"{indent}* [{link['text']}]({link['link']}) {status}"
            # Append every non-image link referenced by this document.
            if file_path and file_path in self.md_links and self.md_links[file_path]:
                referenced_links = self.md_links[file_path]
                for ref_link in referenced_links:
                    # Skip image links.
                    if 'link' in ref_link and self.is_image_link(ref_link['link']):
                        continue
                    ref_status = "" if ref_link.get('exists', False) else ""
                    content += f" | [{ref_link['text']}]({ref_link['link']}) {ref_status}"
            content += "\n"
            # Recurse into children with increased indentation.
            # NOTE(review): the indent increment may have been collapsed to a
            # single space by the diff renderer -- confirm against the repo.
            for child in link.get('children', []):
                generate_link_report(child, indent + " ")

        for link in self.summary_links:
            generate_link_report(link)
        # Write the report.
        try:
            # Make sure the output directory exists.
            output_dir = os.path.dirname(output_path)
            if output_dir and not os.path.exists(output_dir):
                os.makedirs(output_dir)
            with open(output_path, 'w', encoding='utf-8') as file:
                file.write(content)
            print(f"完整报告已生成: {output_path}")
        except Exception as e:
            print(f"写入报告时出错: {e}")

    def generate_error_report(self, output_path):
        """
        Generate the report listing only invalid links.

        Args:
            output_path: output file path.
        """
        if not self.invalid_links:
            print(f"没有发现无效链接,不生成错误报告")
            return
        content = "# GitBook链接检查报告仅错误链接\n\n"
        content += "本报告仅显示文档中的无效链接。每行的格式为:\n"
        content += "* [文档标题](文档链接) | [无效链接](链接路径) ❌\n\n"
        # Group invalid links by the file that contains them.
        links_by_source = defaultdict(list)
        for link in self.invalid_links:
            source = link.get('source_file', 'Unknown')
            links_by_source[source].append(link)
        for source, links in sorted(links_by_source.items()):
            content += f"## 来自 {source}\n\n"
            # Look up the SUMMARY.md entry that points at this source file.
            # NOTE(review): extract_links_from_summary() re-parses SUMMARY.md
            # on EVERY iteration and mutates self.invalid_links /
            # self.summary_links while this report is being built -- this
            # both duplicates invalid-link entries and is O(sources x links);
            # the parse result should be computed once and reused.
            summary_link = None
            for link in self.extract_links_from_summary():
                if link.get('file_path') and os.path.relpath(link['file_path'], self.base_dir) == source:
                    summary_link = link
                    break
            if source == 'SUMMARY.md':
                # Invalid links that live directly in SUMMARY.md.
                # NOTE(review): `status` renders '' -- original marker (likely
                # a cross mark) appears stripped.
                for link in links:
                    status = ""
                    content += f"* [{link['text']}]({link['link']}) {status}\n"
            else:
                if summary_link:
                    # Show the owning document followed by its invalid links.
                    source_status = "" if summary_link.get('exists', False) else ""
                    content += f"* [{summary_link['text']}]({summary_link['link']}) {source_status}"
                    for link in links:
                        content += f" | [{link['text']}]({link['link']}) ❌"
                    content += "\n\n"
                else:
                    # No matching SUMMARY entry: list the invalid links alone.
                    for link in links:
                        content += f"* 来自: {source} - [{link['text']}]({link['link']}) ❌\n"
                    content += "\n"
        # Write the report.
        try:
            # Make sure the output directory exists.
            output_dir = os.path.dirname(output_path)
            if output_dir and not os.path.exists(output_dir):
                os.makedirs(output_dir)
            with open(output_path, 'w', encoding='utf-8') as file:
                file.write(content)
            print(f"错误报告已生成: {output_path}")
        except Exception as e:
            print(f"写入错误报告时出错: {e}")
def main():
    """Interactive/CLI entry point: gather inputs and run the checker.

    Accepts up to three positional arguments (summary_path, base_dir,
    output_path); anything not supplied on the command line is prompted for.
    """
    print("=" * 60)
    print("多线程版GitBook链接检查器")
    print("=" * 60)
    # SUMMARY.md path: argv[1], then prompt, then ./SUMMARY.md.
    if len(sys.argv) > 1:
        summary_path = sys.argv[1]
    else:
        summary_path = input("请输入SUMMARY.md文件路径: ").strip()
        if not summary_path:
            summary_path = os.path.join(os.getcwd(), "SUMMARY.md")
            print(f"使用默认路径: {summary_path}")
    # The summary file must exist.
    if not os.path.isfile(summary_path):
        print(f"错误: 文件 '{summary_path}' 不存在")
        sys.exit(1)
    # Documentation root: argv[2] or prompt (default: summary directory).
    base_dir = os.path.dirname(os.path.abspath(summary_path))
    if len(sys.argv) > 2:
        base_dir = sys.argv[2]
    else:
        input_base_dir = input(f"请输入文档根目录 [默认: {base_dir}]: ").strip()
        if input_base_dir:
            base_dir = input_base_dir
    # Report path: argv[3] or prompt (default inside base_dir).
    if len(sys.argv) > 3:
        output_path = sys.argv[3]
    else:
        default_output = os.path.join(base_dir, "link-check-report.md")
        output_path = input(f"请输入输出文件路径 [默认: {default_output}]: ").strip()
        if not output_path:
            output_path = default_output
    # A directory answer gets the default report filename appended.
    if os.path.isdir(output_path):
        output_path = os.path.join(output_path, "link-check-report.md")
    # Online verification is opt-in; only a literal 'y' enables it.
    verify_online = input("是否验证在线链接? (y/n) [默认: n]: ").strip().lower() == 'y'
    max_threads = 10
    if verify_online:
        # Thread count, falling back to 10 on blank/invalid/out-of-range input.
        try:
            max_threads = int(input(f"请输入最大线程数 [默认: 10]: ").strip() or "10")
            if max_threads < 1:
                max_threads = 10
                print(f"线程数必须大于0已设置为默认值10")
        except ValueError:
            max_threads = 10
            print(f"输入无效已设置为默认值10")
        print(f"将使用 {max_threads} 个线程并行检查在线链接")
    else:
        print("未验证的在线链接将被标记为错误,并添加到错误报告中")
    start_time = time.time()
    try:
        # Build the checker and run the full pipeline.
        checker = LinkChecker(
            summary_path=summary_path,
            base_dir=base_dir,
            verify_online=verify_online,
            max_threads=max_threads
        )
        checker.check_links()
        checker.generate_reports(output_path)
        # Summary statistics.
        total_files = len(checker.processed_files)
        invalid_links = len(checker.invalid_links)
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"\n统计信息:")
        print(f"- 检查的文件数: {total_files}")
        print(f"- 无效链接数: {invalid_links}")
        print(f"- 耗时: {elapsed_time:.2f} 秒")
        print("\n检查完成!")
    except Exception as e:
        # Surface the traceback for easier debugging, then fail.
        print(f"执行过程中出错: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,176 @@
#!/usr/bin/env python3
"""
Improved GitBook Summary link extractor (supports directory output).

Extracts the full contents of a SUMMARY.md file, preserving the original
directory structure and headings, converts the links to online URLs
(without the .md suffix), and can place the output file in a chosen
directory.
"""
import os
import re
import sys
import urllib.parse
def process_summary_file(summary_path, base_url):
    """Rewrite the links in SUMMARY.md as online URLs, keeping its structure.

    Relative and root-absolute links are joined onto *base_url* and a
    trailing ``.md`` suffix is dropped; http(s) links only lose the suffix.
    Anchor-only links (``#...``) are left untouched.

    Args:
        summary_path: path to the SUMMARY.md file (exits on read failure).
        base_url: base URL the document links are joined onto.

    Returns:
        processed_content: the rewritten file contents as a single string.
    """
    print(f"正在处理 {summary_path}...")
    try:
        with open(summary_path, 'r', encoding='utf-8') as file:
            content = file.read()
    except Exception as e:
        print(f"读取文件时出错: {e}")
        sys.exit(1)
    # Ensure base_url ends with '/' so urljoin treats it as a directory.
    if not base_url.endswith('/'):
        base_url += '/'
    link_pattern = r'\[([^\]]+)\]\(([^)]+)\)'
    processed_lines = []
    for line in content.split('\n'):
        processed_line = line
        # Replace every markdown link found on the line.
        for text, link in re.findall(link_pattern, line):
            # Skip anchor-only links.
            if link.startswith('#'):
                continue
            # Build the full URL for non-http(s) links.
            if not link.startswith(('http://', 'https://')):
                # BUG FIX: strip the leading '/' only for URL joining, and
                # keep the ORIGINAL link for the text replacement below.
                # Previously `link` itself was mutated first, so the search
                # string never matched root-absolute links ("/a/b.md") and
                # they were left unconverted.
                target = link[1:] if link.startswith('/') else link
                full_url = urllib.parse.urljoin(base_url, target)
            else:
                full_url = link
            # Drop the .md suffix.
            if full_url.endswith('.md'):
                full_url = full_url[:-3]
            original_link = f"[{text}]({link})"
            new_link = f"[{text}]({full_url})"
            processed_line = processed_line.replace(original_link, new_link)
        processed_lines.append(processed_line)
    return '\n'.join(processed_lines)
def save_to_markdown(content, output_path):
    """Write *content* to a Markdown file.

    When *output_path* is an existing directory, the file is created inside
    it as ``gitbook-urls.md``; otherwise *output_path* names the file itself.
    Missing parent directories are created. Exits the process on any
    directory-creation or write failure.

    Args:
        content: text to write.
        output_path: target file path, or a directory to drop the file into.
    """
    # Directory targets get the default filename appended.
    output_file = (
        os.path.join(output_path, "gitbook-urls.md")
        if os.path.isdir(output_path)
        else output_path
    )
    # Create the parent directory when it does not exist yet.
    parent = os.path.dirname(output_file)
    if parent and not os.path.exists(parent):
        try:
            os.makedirs(parent)
            print(f"已创建目录: {parent}")
        except Exception as e:
            print(f"创建目录时出错: {e}")
            sys.exit(1)
    try:
        with open(output_file, 'w', encoding='utf-8') as fh:
            fh.write(content)
        print(f"Markdown文件已生成: {output_file}")
    except Exception as e:
        print(f"写入文件时出错: {e}")
        sys.exit(1)
def add_header(content):
    """Prepend the report title and a short explanatory line to *content*.

    Args:
        content: the processed SUMMARY body.

    Returns:
        new_content: header + original content.
    """
    banner = (
        "# GitBook文档链接\n\n"
        "以下是从SUMMARY.md提取的文档结构和链接\n\n"
    )
    return banner + content
if __name__ == "__main__":
    # Interactive/CLI driver: argv[1]=summary path, argv[2]=base URL,
    # argv[3]=output path or directory; missing values are prompted for.
    print("=" * 60)
    print("改进的GitBook Summary链接提取器 (支持目录输出)")
    print("=" * 60)
    # SUMMARY.md path: argv[1], prompt, or ./SUMMARY.md.
    if len(sys.argv) > 1:
        summary_path = sys.argv[1]
    else:
        summary_path = input("请输入SUMMARY.md文件路径: ").strip()
        if not summary_path:
            summary_path = os.path.join(os.getcwd(), "SUMMARY.md")
            print(f"使用默认路径: {summary_path}")
    # The summary file must exist.
    if not os.path.isfile(summary_path):
        print(f"错误: 文件 '{summary_path}' 不存在")
        sys.exit(1)
    # Base URL: argv[2], prompt, or a placeholder default.
    if len(sys.argv) > 2:
        base_url = sys.argv[2]
    else:
        base_url = input("请输入文档基础URL: ").strip()
        if not base_url:
            base_url = "https://docs.example.com/"
            print(f"使用默认URL: {base_url}")
    # Output file path or directory: argv[3] or prompt.
    if len(sys.argv) > 3:
        output_path = sys.argv[3]
    else:
        default_output = os.path.join(os.path.dirname(summary_path), "gitbook-urls.md")
        output_path = input(f"请输入输出文件路径或目录 [默认: {default_output}]: ").strip()
        if not output_path:
            output_path = default_output
    # Convert the links.
    processed_content = process_summary_file(summary_path, base_url)
    # Prepend title and description.
    final_content = add_header(processed_content)
    # Persist the result.
    save_to_markdown(final_content, output_path)
    print("\n处理完成!")

View File

@@ -0,0 +1,367 @@
#!/usr/bin/env python3
"""
Local GitBook Markdown link-checking tool.

This script:
1. extracts all document links from SUMMARY.md
2. parses every local Markdown file
3. extracts and validates the internal links in each file
4. generates a link-check report
"""
import os
import re
import sys
import csv
from datetime import datetime
from urllib.parse import urlparse, urljoin

# Auto-install dependencies on first run if missing.
# NOTE(review): bs4/markdown are imported here but not used anywhere in the
# code visible in this file section -- confirm they are needed at all.
try:
    from bs4 import BeautifulSoup
    import markdown
except ImportError:
    print("正在安装必要依赖...")
    import subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "beautifulsoup4", "markdown"])
    from bs4 import BeautifulSoup
    import markdown
class GitbookLocalChecker:
    """GitBook local-file link-checking tool."""

    def __init__(self, summary_path, base_dir=None, remove_md=True):
        """
        Initialize the link checker.

        Args:
            summary_path: path to the SUMMARY.md file.
            base_dir: documentation root; defaults to the SUMMARY.md directory.
            remove_md: whether to strip the .md suffix.
                NOTE(review): stored but never read in the code shown here --
                confirm it is used elsewhere or remove it.
        """
        self.summary_path = os.path.abspath(summary_path)
        self.base_dir = base_dir or os.path.dirname(self.summary_path)
        self.remove_md = remove_md
        self.all_links = []      # every link discovered (summary + internal)
        self.all_md_files = []   # work list of local .md files to parse
        self.invalid_links = []  # links whose target file does not exist
        # Files already parsed, to avoid processing the same file twice.
        self.processed_files = set()

    def extract_summary_links(self):
        """Extract all Markdown file links from SUMMARY.md.

        Side effects: extends self.all_links and seeds self.all_md_files
        with every existing target file.

        Returns:
            links: list of link-info dicts for .md links in SUMMARY.md.
        """
        print(f"正在从 {self.summary_path} 提取文档链接...")
        with open(self.summary_path, 'r', encoding='utf-8') as file:
            content = file.read()
        # Extract markdown links with a regex.
        link_pattern = r'\[([^\]]+)\]\(([^)]+)\)'
        matches = re.findall(link_pattern, content)
        links = []
        for i, (text, link) in enumerate(matches, 1):
            # Keep only non-anchor links that point at .md files.
            if not link.startswith('#') and link.endswith('.md'):
                # Resolve against the documentation root.
                local_path = os.path.normpath(os.path.join(self.base_dir, link))
                links.append({
                    'id': i,
                    'text': text,
                    'link': link,
                    'local_path': local_path,
                    'exists': os.path.exists(local_path),
                    'type': 'summary_link',
                    'source_file': 'SUMMARY.md'
                })
                # Queue existing files for parsing.
                if os.path.exists(local_path):
                    self.all_md_files.append(local_path)
        print(f"找到 {len(links)} 个文档链接,{len(self.all_md_files)} 个本地Markdown文件")
        self.all_links.extend(links)
        return links

    def process_md_file(self, file_path):
        """Parse one Markdown file and extract its internal links.

        Returns an empty list for already-processed or unreadable files.
        """
        # Skip files that were already processed.
        if file_path in self.processed_files:
            return []
        self.processed_files.add(file_path)
        relative_path = os.path.relpath(file_path, self.base_dir)
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
            # Extract all markdown links.
            link_pattern = r'\[([^\]]+)\]\(([^)]+)\)'
            matches = re.findall(link_pattern, content)
            links = []
            for text, link in matches:
                # Skip external and anchor-only links.
                if link.startswith(('http://', 'https://', '#')):
                    continue
                # Resolve the target path.
                if link.startswith('/'):
                    # Root-absolute: resolve against the documentation root.
                    target_path = os.path.normpath(os.path.join(self.base_dir, link.lstrip('/')))
                else:
                    # Relative: resolve against the current file's directory.
                    target_path = os.path.normpath(os.path.join(os.path.dirname(file_path), link))
                # Extension-less links: directories map to README.md,
                # otherwise assume an implicit .md suffix.
                if not os.path.splitext(target_path)[1]:
                    if os.path.isdir(target_path):
                        target_path = os.path.join(target_path, 'README.md')
                    else:
                        target_path += '.md'
                # Does the resolved target exist?
                exists = os.path.exists(target_path)
                link_info = {
                    'text': text,
                    'link': link,
                    'local_path': target_path,
                    'target_file': os.path.basename(target_path),
                    'exists': exists,
                    'type': 'internal_link',
                    'source_file': relative_path
                }
                links.append(link_info)
                # Track invalid links.
                if not exists:
                    self.invalid_links.append(link_info)
                # Valid, unprocessed .md targets join the work list.
                elif target_path.endswith('.md') and target_path not in self.processed_files:
                    self.all_md_files.append(target_path)
            return links
        except Exception as e:
            print(f"处理文件 {file_path} 时出错: {e}")
            return []

    def process_all_files(self):
        """Process every Markdown file reachable from SUMMARY.md."""
        print("开始处理所有Markdown文件...")
        # Seed the work list from SUMMARY.md.
        self.extract_summary_links()
        # Iterate the work list; processing may discover new files, which
        # are appended to files_to_process below (extending a list while a
        # for-loop iterates it is safe in Python and picks up the new items).
        files_to_process = list(self.all_md_files)  # copy: new files get appended during processing
        processed_count = 0
        for file_path in files_to_process:
            if file_path not in self.processed_files:
                relative_path = os.path.relpath(file_path, self.base_dir)
                print(f"处理文件: {relative_path}")
                links = self.process_md_file(file_path)
                self.all_links.extend(links)
                processed_count += 1
                # Pull in any newly-discovered files.
                new_files = [f for f in self.all_md_files if f not in files_to_process and f not in self.processed_files]
                files_to_process.extend(new_files)
        print(f"已处理 {processed_count} 个Markdown文件")
        print(f"共找到 {len(self.all_links)} 个链接,其中 {len(self.invalid_links)} 个无效")

    def generate_markdown_report(self, output_path):
        """Generate the Markdown-format report."""
        print(f"正在生成报告: {output_path}")
        content = f"""# GitBook本地链接检查报告
## 摘要
- 检查时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
- 处理文件数: {len(self.processed_files)}
- 总链接数: {len(self.all_links)}
- 无效链接数: {len(self.invalid_links)}
## 无效链接列表
"""
        # Group invalid links by the file that contains them.
        grouped_links = {}
        for link in self.invalid_links:
            source = link['source_file']
            if source not in grouped_links:
                grouped_links[source] = []
            grouped_links[source].append(link)
        for source, links in sorted(grouped_links.items()):
            content += f"\n### 文件: {source}\n"
            for link in links:
                content += f"- [{link['text']}]({link['link']}) -> {link['local_path']} (无效)\n"
        # Per-file link statistics.
        content += "\n## 文件链接统计\n"
        file_stats = {}
        for link in self.all_links:
            source = link['source_file']
            if source not in file_stats:
                file_stats[source] = {'total': 0, 'invalid': 0}
            file_stats[source]['total'] += 1
            if not link['exists']:
                file_stats[source]['invalid'] += 1
        for source, stats in sorted(file_stats.items()):
            content += f"- {source}: 共 {stats['total']} 个链接,{stats['invalid']} 个无效\n"
        with open(output_path, 'w', encoding='utf-8') as file:
            file.write(content)
        print(f"报告已生成: {output_path}")

    def generate_csv_report(self, output_path):
        """Generate the CSV-format report (one row per discovered link)."""
        print(f"正在生成CSV报告: {output_path}")
        with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['source_file', 'text', 'link', 'local_path', 'exists', 'type']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for link in self.all_links:
                writer.writerow({
                    'source_file': link['source_file'],
                    'text': link['text'],
                    'link': link['link'],
                    'local_path': link['local_path'],
                    'exists': link['exists'],
                    'type': link['type']
                })
        print(f"CSV报告已生成: {output_path}")
def get_input_with_default(prompt, default=None):
    """Prompt the user for input, falling back to *default* on blank entry.

    Args:
        prompt: text shown to the user (without a trailing colon).
        default: value returned when the user just presses Enter. When None,
            no default hint is shown and the raw input is returned as-is.

    Returns:
        The entered string, or *default* when the entry was blank.
    """
    # BUG FIX: the original tested `if default:`, so falsy-but-valid defaults
    # (e.g. "" or 0) were silently treated as "no default". Compare against
    # None explicitly instead.
    if default is not None:
        user_input = input(f"{prompt} [{default}]: ")
        return user_input if user_input.strip() else default
    return input(f"{prompt}: ")
def get_yes_no_input(prompt, default="y"):
    """Ask the user a yes/no question and return the answer as a bool.

    Args:
        prompt: question text (without the [Y/n] suffix, which is appended).
        default: default answer used on blank input ('y'/'yes' means True).

    Returns:
        True for yes, False for no; unrecognized input yields the default.
    """
    # BUG FIX: the original mapping contained a duplicate '' key mapping to
    # both True and False (almost certainly stripped non-ASCII tokens such
    # as 是/否 -- TODO confirm against the repo). The duplicate was dead code
    # anyway, because blank input is short-circuited below before the lookup,
    # so dropping the '' entries preserves behavior.
    valid_responses = {'y': True, 'yes': True, 'n': False, 'no': False}
    # Render the prompt with the proper default marker.
    if default.lower() in ('y', 'yes', ''):
        prompt = f"{prompt} [Y/n]: "
        default_value = True
    else:
        prompt = f"{prompt} [y/N]: "
        default_value = False
    user_input = input(prompt).lower()
    # Blank input selects the default.
    if not user_input:
        return default_value
    return valid_responses.get(user_input, default_value)
def main():
    """Interactive entry point: gather inputs and run the local checker."""
    print("=" * 60)
    print("本地GitBook Markdown文件链接检查工具")
    print("=" * 60)
    # Keep prompting until an existing SUMMARY.md path is supplied.
    while True:
        summary_path = get_input_with_default(
            "请输入SUMMARY.md文件路径",
            os.path.join(os.getcwd(), "SUMMARY.md")
        )
        if os.path.isfile(summary_path):
            break
        else:
            print(f"错误: 文件 '{summary_path}' 不存在")
    # Documentation root (default: directory containing SUMMARY.md).
    default_base_dir = os.path.dirname(os.path.abspath(summary_path))
    base_dir = get_input_with_default(
        "请输入文档根目录(包含所有Markdown文件的目录)",
        default_base_dir
    )
    # Output directory for both reports.
    output_dir = get_input_with_default(
        "请输入输出目录",
        os.path.dirname(summary_path) or os.getcwd()
    )
    # Make sure the output directory exists.
    os.makedirs(output_dir, exist_ok=True)
    # Fixed report filenames inside the output directory.
    report_path = os.path.join(output_dir, "gitbook-links-report.md")
    csv_path = os.path.join(output_dir, "gitbook-links-report.csv")
    # Whether to strip the .md suffix (forwarded to the checker).
    remove_md = get_yes_no_input("是否移除链接中的.md后缀", "y")
    try:
        # Build the checker and run it end to end.
        checker = GitbookLocalChecker(
            summary_path=summary_path,
            base_dir=base_dir,
            remove_md=remove_md
        )
        checker.process_all_files()
        # Emit both report formats.
        checker.generate_markdown_report(report_path)
        checker.generate_csv_report(csv_path)
        print("\n检查完成!")
        print(f"Markdown报告: {report_path}")
        print(f"CSV报告: {csv_path}")
        # Console summary.
        print(f"\n摘要:")
        print(f"- 处理文件数: {len(checker.processed_files)}")
        print(f"- 总链接数: {len(checker.all_links)}")
        print(f"- 无效链接数: {len(checker.invalid_links)}")
        if checker.invalid_links:
            # Show at most five sample invalid links.
            print("\n无效链接示例:")
            for i, link in enumerate(checker.invalid_links[:5], 1):
                print(f"{i}. 文件 '{link['source_file']}' 中 [{link['text']}]({link['link']}) -> {link['local_path']} (无效)")
            if len(checker.invalid_links) > 5:
                print(f"... 以及其他 {len(checker.invalid_links) - 5} 个无效链接")
    except Exception as e:
        # Surface the traceback for easier debugging, then fail.
        print(f"执行过程中出错: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()

View File

@@ -3,6 +3,7 @@
import os
import re
import shutil
from pathlib import Path
import logging
@@ -18,7 +19,9 @@ logging.basicConfig(
logger = logging.getLogger("md-to-mdx")
class MarkdownToMDXConverter:
def __init__(self):
    def __init__(self, backup=True, in_place=False):
        """Create a Markdown-to-MDX converter.

        Args:
            backup: create a ``.bak`` copy of each source file before converting.
            in_place: convert next to the source file and delete the original.
        """
        self.backup = backup
        self.in_place = in_place
        self.conversion_count = 0    # files converted successfully
        self.error_count = 0         # files that raised during conversion
        self.base_output_dir = None  # output root, set lazily by process_directory
@@ -31,90 +34,351 @@ class MarkdownToMDXConverter:
logger.error(f"输入目录不存在: {input_dir}")
return
if self.base_output_dir is None and output_dir:
# 保存基础输出目录,用于构建子目录输出路径
if not self.in_place and self.base_output_dir is None and output_dir:
self.base_output_dir = Path(output_dir)
self.base_input_dir = input_path
self.base_output_dir.mkdir(parents=True, exist_ok=True)
logger.info(f"创建基础输出目录: {self.base_output_dir}")
for file in input_path.glob("*.md"):
if self.base_output_dir:
rel_path = file.parent.relative_to(self.base_input_dir) if file.parent != self.base_input_dir else Path('')
target_dir = self.base_output_dir / rel_path
target_dir.mkdir(parents=True, exist_ok=True)
self._process_file(file, target_dir)
# 处理当前目录中的所有.md和.mdx文件
for file in list(input_path.glob("*.md")) + list(input_path.glob("*.mdx")):
if self.in_place:
# 在原位置处理
self._process_file(file, file.parent, delete_original=True)
else:
self._process_file(file, file.parent)
# 计算相对于基础输入目录的路径
if self.base_output_dir:
rel_path = file.parent.relative_to(self.base_input_dir) if file.parent != self.base_input_dir else Path('')
target_dir = self.base_output_dir / rel_path
target_dir.mkdir(parents=True, exist_ok=True)
self._process_file(file, target_dir)
else:
# 如果没有基础输出目录,则就地处理
self._process_file(file, file.parent)
# 如果需要递归处理子目录
if recursive:
for subdir in [d for d in input_path.iterdir() if d.is_dir()]:
# 跳过output目录避免重复处理
if subdir.name == "output" or subdir.name.startswith('.'):
continue
self.process_directory(subdir, output_dir, recursive)
def _process_file(self, file_path, output_dir):
    def _process_file(self, file_path, output_dir, delete_original=False):
        """Convert a single Markdown file to MDX.

        Args:
            file_path: Path of the source .md/.mdx file.
            output_dir: Path of the directory receiving the .mdx output.
            delete_original: remove the source file after a successful write
                (used by in-place mode).
        """
        try:
            logger.info(f"处理文件: {file_path}")
            # Back up the original file once (skip if a .bak already exists).
            if self.backup:
                backup_file = str(file_path) + ".bak"
                if not os.path.exists(backup_file):
                    shutil.copy2(file_path, backup_file)
                    logger.info(f"已创建备份: {backup_file}")
            # Read the source.
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            # Pre-clean passes before the main conversion.
            content = self._fix_broken_text(content)
            content = self._convert_images(content)
            content = self._convert_hints(content)
            # Main GitBook -> MDX conversion.
            converted_content = self.convert_content(content)
            # Output path: same stem, .mdx suffix, inside output_dir.
            output_file = output_dir / (file_path.stem + ".mdx")
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(converted_content)
            logger.info(f"转换完成: {output_file}")
            self.conversion_count += 1
            # Optionally remove the source (failure here is logged, not fatal).
            if delete_original:
                try:
                    os.remove(file_path)
                    logger.info(f"已删除源文件: {file_path}")
                except Exception as e:
                    logger.error(f"删除源文件 {file_path} 失败: {str(e)}")
        except Exception as e:
            logger.error(f"处理文件 {file_path} 时出错: {str(e)}")
            self.error_count += 1
    def _fix_broken_text(self, content):
        """Fix broken text around fenced code blocks.

        Drops a stray language tag that ended up AFTER the closing fence
        (```py ... ```js -> ```py ... ```), keeping the opening tag.
        """
        broken_code_pattern = re.compile(r'```([a-zA-Z]*)\r?\n(.*?)\r?\n```([a-zA-Z]*)', re.DOTALL)
        content = broken_code_pattern.sub(r'```\1\n\2\n```', content)
        return content
    def _convert_images(self, content):
        """Convert HTML <figure><img> blocks to Markdown image syntax."""
        # <figure><img> with an EMPTY <figcaption> -> plain image, no alt text.
        img_pattern_no_caption = re.compile(r'<figure>\s*<img src="([^"]+)" alt="([^"]*)">\s*<figcaption></figcaption>\s*</figure>', re.DOTALL)
        content = img_pattern_no_caption.sub(r'![](\1)', content)
        # <figure><img> with a caption -> image using the CAPTION (group 3,
        # not the alt attribute) as the Markdown alt text.
        img_pattern_with_caption = re.compile(r'<figure>\s*<img src="([^"]+)" alt="([^"]*)">\s*<figcaption><p>(.*?)</p></figcaption>\s*</figure>', re.DOTALL)
        def img_replacer(match):
            img_src = match.group(1)
            alt_text = match.group(3).strip()
            return f'![{alt_text}]({img_src})'
        content = img_pattern_with_caption.sub(img_replacer, content)
        return content
def _convert_hints(self, content):
"""转换 hint 提示框"""
hint_pattern = re.compile(r'{%\s*hint\s*style="info"\s*%}\s*{%\s*endhint\s*%}', re.DOTALL)
content = hint_pattern.sub(r'<Info>\n</Info>', content)
return content
def convert_content(self, content):
    """Convert Gitbook-flavored Markdown content to Mintlify MDX.

    The numbered passes below are order-dependent: the <figure>/<img>
    passes (5.1-5.4) and the sized-<img> pass (7) emit normalized
    multi-line ``<img ... className="mx-auto" ...>`` tags which the
    catch-all standalone-<img> pass (7.1) must leave untouched (see the
    guard there — previously it re-matched those tags and dropped their
    width attribute).

    Args:
        content: Full Markdown text of a single document.

    Returns:
        The converted MDX text.
    """
    # 1. Turn the document's first h1 heading into MDX frontmatter.
    h1_pattern = re.compile(r'^#\s+(.+?)$', re.MULTILINE)
    match = h1_pattern.search(content)
    if match:
        title = match.group(1).strip()
        content = h1_pattern.sub(f'---\ntitle: {title}\n---\n', content, count=1)
    # 2. Convert {% hint style="..." %} blocks into callout components
    # (<Info>, <Warning>, ... — the style name capitalized).
    hint_pattern = re.compile(
        r'{%\s*hint\s+style="(\w+)"\s*%}(.*?){%\s*endhint\s*%}',
        re.DOTALL
    )
    def hint_replacer(match):
        style = match.group(1)
        text = match.group(2).strip()
        component_name = style.capitalize() if style != "info" else "Info"
        return f'<{component_name}>\n{text}\n</{component_name}>'
    content = hint_pattern.sub(hint_replacer, content)
    # 3. Convert {% content-ref %} blocks into <Card> links.
    card_pattern = re.compile(
        r'{%\s*content-ref\s+url="([^"]+)"\s*%}\s*\[([^\]]+)\]\(([^)]+)\)\s*{%\s*endcontent-ref\s*%}',
        re.DOTALL
    )
    def card_replacer(match):
        url = match.group(1)
        title = match.group(2)
        return f'<Card title="{title}" icon="link" href="{url}">\n {title}\n</Card>'
    content = card_pattern.sub(card_replacer, content)
    # 4. Lay out two consecutive Markdown images side by side.
    img_pattern = re.compile(r'!\[(.*?)\]\((.*?)\)\s*!\[(.*?)\]\((.*?)\)', re.DOTALL)
    def img_side_replacer(match):
        alt1 = match.group(1) or "Image 1"
        src1 = match.group(2)
        alt2 = match.group(3) or "Image 2"
        src2 = match.group(4)
        return f'''<div class="image-side-by-side">
<figure>
<img src="{src1}" alt="{alt1}" />
</figure>
<figure>
<img src="{src2}" alt="{alt2}" />
</figure>
</div>'''
    content = img_pattern.sub(img_side_replacer, content)
    # 5. Unwrap <Frame>-wrapped images back into Markdown images.
    frame_pattern = re.compile(r'<Frame>\s*<img\s+src="([^"]+)"\s+alt="([^"]+)"\s*/>\s*</Frame>', re.DOTALL)
    def frame_replacer(match):
        src = match.group(1)
        alt = match.group(2)
        return f'![{alt}]({src})'
    content = frame_pattern.sub(frame_replacer, content)
    # 5.1 <figure><img width><figcaption> -> sized, centered <img>;
    # a non-empty caption replaces the alt text.
    figure_img_width_caption_pattern = re.compile(r'<figure>\s*<img\s+src="([^"]+)"\s+alt="([^"]*)"\s+width="(\d+)"\s*/?>\s*<figcaption>(?:<p>)?(.*?)(?:</p>)?</figcaption>\s*</figure>', re.DOTALL)
    def figure_img_width_caption_replacer(match):
        src = match.group(1)
        alt = match.group(2) or ""
        width = match.group(3)
        caption = match.group(4).strip()
        if caption:
            alt = caption
        return f'''<img
src="{src}"
width="{width}"
className="mx-auto"
alt="{alt}"
/>'''
    content = figure_img_width_caption_pattern.sub(figure_img_width_caption_replacer, content)
    # 5.2 <figure><img width> without a caption.
    figure_img_width_pattern = re.compile(r'<figure>\s*<img\s+src="([^"]+)"\s+alt="([^"]*)"\s+width="(\d+)"\s*/?>\s*</figure>', re.DOTALL)
    def figure_img_width_replacer(match):
        src = match.group(1)
        alt = match.group(2) or ""
        width = match.group(3)
        return f'''<img
src="{src}"
width="{width}"
className="mx-auto"
alt="{alt}"
/>'''
    content = figure_img_width_pattern.sub(figure_img_width_replacer, content)
    # 5.3 <figure><img> with a caption but no width.
    figure_img_caption_pattern = re.compile(r'<figure>\s*<img\s+src="([^"]+)"\s+alt="([^"]*)"\s*/?>\s*<figcaption>(?:<p>)?(.*?)(?:</p>)?</figcaption>\s*</figure>', re.DOTALL)
    def figure_img_caption_replacer(match):
        src = match.group(1)
        alt = match.group(2) or ""
        caption = match.group(3).strip()
        if caption:
            alt = caption
        return f'''<img
src="{src}"
className="mx-auto"
alt="{alt}"
/>'''
    content = figure_img_caption_pattern.sub(figure_img_caption_replacer, content)
    # 5.4 Bare <figure><img> with neither caption nor width.
    figure_img_no_caption_pattern = re.compile(r'<figure>\s*<img\s+src="([^"]+)"\s+alt="([^"]*)"\s*/?>\s*</figure>', re.DOTALL)
    def figure_img_no_caption_replacer(match):
        src = match.group(1)
        alt = match.group(2) or ""
        return f'''<img
src="{src}"
className="mx-auto"
alt="{alt}"
/>'''
    content = figure_img_no_caption_pattern.sub(figure_img_no_caption_replacer, content)
    # 6. Convert {% tabs %} blocks into <Tabs>/<Tab> components.
    tabs_pattern = re.compile(
        r'{%\s*tabs\s*%}(.*?){%\s*endtabs\s*%}',
        re.DOTALL
    )
    def tabs_replacer(match):
        tabs_content = match.group(1)
        # Each {% tab title="..." %}...{% endtab %} becomes one <Tab>.
        tab_pattern = re.compile(
            r'{%\s*tab\s+title="([^"]+)"\s*%}(.*?){%\s*endtab\s*%}',
            re.DOTALL
        )
        tabs_start = "<Tabs>"
        tabs_items = []
        for tab_match in tab_pattern.finditer(tabs_content):
            title = tab_match.group(1)
            # Renamed from 'content' to avoid shadowing the outer variable.
            tab_body = tab_match.group(2).strip()
            tabs_items.append(f' <Tab title="{title}">\n {tab_body}\n </Tab>')
        tabs_end = "</Tabs>"
        return tabs_start + "\n" + "\n".join(tabs_items) + "\n" + tabs_end
    content = tabs_pattern.sub(tabs_replacer, content)
    # 7. Standalone <img> tags that carry an explicit width.
    img_size_pattern = re.compile(r'<img\s+src="([^"]+)"\s+width="(\d+)"(?:\s+alt="([^"]*)")?\s*/>', re.DOTALL)
    def img_size_replacer(match):
        src = match.group(1)
        width = match.group(2)
        alt = match.group(3) if match.group(3) else ""
        return f'''<img
src="{src}"
width="{width}"
className="mx-auto"
alt="{alt}"
/>'''
    content = img_size_pattern.sub(img_size_replacer, content)
    # 7.1 Catch-all for any remaining standalone <img> tag.
    standalone_img_pattern = re.compile(r'<img\s+src="([^"]+)"(?:\s+alt="([^"]*)")?[^>]*>', re.DOTALL)
    def standalone_img_replacer(match):
        # Bug fix: under DOTALL the [^>]* above also matches the
        # multi-line tags emitted by passes 5.x/7; rewriting those again
        # silently dropped their width attribute. Tags produced by the
        # earlier passes always carry className="mx-auto", so skip them.
        if 'className="mx-auto"' in match.group(0):
            return match.group(0)
        src = match.group(1)
        alt = match.group(2) if match.group(2) else ""
        return f'''<img
src="{src}"
className="mx-auto"
alt="{alt}"
/>'''
    content = standalone_img_pattern.sub(standalone_img_replacer, content)
    # 8. Convert Markdown tables into explicit HTML/MDX <table> markup.
    # NOTE(review): the pattern requires every row (including the last)
    # to end with a newline — a table at EOF without one is left as-is.
    table_pattern = re.compile(r'(\|.*\|\n\|[-:\s|]*\|\n(?:\|.*\|\n)+)', re.MULTILINE)
    def table_replacer(match):
        md_table = match.group(1)
        lines = md_table.strip().split('\n')
        # Header row, then the |---| separator row (ignored), then body rows.
        header_row = lines[0]
        header_cells = [cell.strip() for cell in header_row.split('|')[1:-1]]
        body_rows = lines[2:]
        body_cells_rows = []
        for row in body_rows:
            cells = [cell.strip() for cell in row.split('|')[1:-1]]
            body_cells_rows.append(cells)
        mdx_table = "<table>\n <thead>\n <tr>\n"
        for cell in header_cells:
            mdx_table += f" <th>{cell}</th>\n"
        mdx_table += " </tr>\n </thead>\n <tbody>\n"
        for row_cells in body_cells_rows:
            mdx_table += " <tr>\n"
            for cell in row_cells:
                # Convert Markdown [text](url) links to HTML anchors first.
                link_pattern = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
                cell = link_pattern.sub(r'<a href="\2">\1</a>', cell)
                # <br> in any form (<br>, <br/>, <br />) marks a paragraph break.
                br_pattern = re.compile(r'<br\s*/?>')
                if '<p>' in cell or br_pattern.search(cell):
                    # Existing <p> plus <br>: split into separate paragraphs.
                    if '<p>' in cell and br_pattern.search(cell):
                        cell = br_pattern.sub(r'</p>\n <p>', cell)
                        # Drop an empty trailing <br> just before </p>.
                        cell = re.sub(r'<br\s*/?>(\s*</p>)', r'\1', cell)
                    # <br> only: wrap each fragment in its own <p>.
                    elif br_pattern.search(cell) and not '<p>' in cell:
                        paragraphs = br_pattern.split(cell)
                        cell = '<p>' + '</p>\n <p>'.join([p.strip() for p in paragraphs if p.strip()]) + '</p>'
                    mdx_table += f" <td>\n {cell}\n </td>\n"
                else:
                    # Plain text cell.
                    mdx_table += f" <td>{cell}</td>\n"
            mdx_table += " </tr>\n"
        mdx_table += " </tbody>\n</table>"
        return mdx_table
    content = table_pattern.sub(table_replacer, content)
    return content
def get_statistics(self):
    """Return the counters accumulated while converting files.

    Returns:
        dict with 'conversion_count' (files converted successfully) and
        'error_count' (files that failed during processing).
    """
    # NOTE(review): the original return dict was truncated by unmerged
    # diff residue (@@-hunk headers); reconstructed from the counters this
    # class maintains (_process_file increments both) and the keys read in
    # main(). Confirm against upstream history.
    return {
        'conversion_count': self.conversion_count,
        'error_count': self.error_count,
    }


def main():
    """Interactive CLI entry point for the Gitbook -> Mintlify converter.

    NOTE(review): this region of the file contained unmerged diff residue
    (hunk headers, duplicated old/new code paths and a doubled main()
    call); this body reconstructs the new-side version of the patch.
    """
    print("Gitbook Markdown 转 Mintlify MDX 转换工具")
    print("=" * 60)
    # Ask for the source file or directory interactively.
    input_path_str = input("请输入源文件或目录路径: ")
    input_path = Path(input_path_str)
    if not input_path.exists():
        print(f"错误: 路径 '{input_path_str}' 不存在!")
        return
    # Recurse into subdirectories? (directories only)
    recursive = False
    if input_path.is_dir():
        recursive_input = input("是否递归处理所有子目录? (y/n): ").lower()
        recursive = recursive_input in ('y', 'yes')
    # Create .bak backups? (default: yes)
    backup_input = input("是否创建备份文件? (y/n, 默认:y): ").lower()
    create_backup = backup_input in ('', 'y', 'yes')
    # Convert in place and delete sources? (default: no)
    in_place_input = input("是否在原地转换并删除源文件? (y/n, 默认:n): ").lower()
    in_place = in_place_input in ('y', 'yes')
    # Determine the output directory unless converting in place.
    output_dir = None
    if not in_place:
        if input_path.is_file():
            output_dir = input_path.parent / "output"
        else:
            output_dir = input_path / "output"
        output_dir.mkdir(parents=True, exist_ok=True)
        print(f"输出目录已创建: {output_dir}")
    # Build the converter and dispatch on file vs. directory input.
    converter = MarkdownToMDXConverter(backup=create_backup, in_place=in_place)
    if input_path.is_file() and input_path.suffix.lower() == '.md':
        if in_place:
            converter._process_file(input_path, input_path.parent, delete_original=True)
        else:
            converter._process_file(input_path, output_dir)
    elif input_path.is_dir():
        converter.process_directory(input_path, output_dir, recursive)
    else:
        logger.error(f"无效的输入路径: {input_path_str}")
        print(f"错误: '{input_path_str}' 不是有效的Markdown文件或目录!")
        return
    # Print a summary of the run.
    stats = converter.get_statistics()
    print("=" * 60)
    print(f"转换完成! 成功转换: {stats['conversion_count']}个文件, 错误: {stats['error_count']}个文件")
    if not in_place and output_dir:
        print(f"转换结果已保存至: {output_dir}")
    print("=" * 60)


if __name__ == "__main__":
    main()