#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
交互式图片路径修复工具
该脚本可以交互式地查找并修复 .mdx 文件中的相对图片路径,
将它们替换为原始 dify-docs 仓库中对应的在线 URL。
每次找到一个问题时,会显示详细信息并等待用户确认后再进行修改。
目录映射关系:
dify-docs -> dify-docs
zh-hans -> zh_CN
en -> en
ja-jp -> jp
"""
import os
import re
import sys
from pathlib import Path
import argparse
from colorama import init, Fore, Style
import difflib
# 初始化 colorama
init(autoreset=True)
# 查找图片的正则表达式
MD_IMAGE_RE = re.compile(r'!\[(.*?)\]\(((?!https?://|/).+?\.(png|jpe?g|gif|svg))\)')
HTML_IMAGE_RE = re.compile(r'
]*src=["\']([^"\']+\.(png|jpe?g|gif|svg))["\'][^>]*>')
FRAME_IMAGE_RE = re.compile(r'
]*src=["\'](/[^"\']+\.(png|jpe?g|gif|svg))["\'][^>]*>')
# 查找在线URL的正则表达式
MD_ONLINE_URL_RE = re.compile(r'!\[[^\]]*\]\((https://[^\s\)]+\.(png|jpe?g|gif|svg|jpeg))\)')
HTML_ONLINE_URL_RE = re.compile(r'
]*src=["\'](https://[^\s"\']+\.(png|jpe?g|gif|svg|jpeg))["\'][^>]*>')
# 语言目录映射
LANGUAGE_MAPPING = {
'zh-hans': 'zh_CN',
'en': 'en',
'ja-jp': 'jp'
}
REVERSE_LANGUAGE_MAPPING = {v: k for k, v in LANGUAGE_MAPPING.items()}
def find_relative_images(file_path):
"""
在 .mdx 文件中查找所有相对路径的图片。
返回一个包含 (匹配文本, 图片路径, 行号, 位置) 的列表
"""
relative_images = []
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read()
# 检查 Markdown 图片语法
for match in MD_IMAGE_RE.finditer(content):
image_path = match.group(2)
if not image_path.startswith(('http://', 'https://', '/')):
# 记录行号和位置
line_no = content[:match.start()].count('\n') + 1
position = match.start()
relative_images.append((match.group(0), image_path, line_no, position))
# 检查 HTML img 标签
for match in HTML_IMAGE_RE.finditer(content):
image_path = match.group(1)
if not image_path.startswith(('http://', 'https://', '/')):
line_no = content[:match.start()].count('\n') + 1
position = match.start()
relative_images.append((match.group(0), image_path, line_no, position))
# 检查 Frame 组件中的相对路径
for match in FRAME_IMAGE_RE.finditer(content):
image_path = match.group(1)
# 如 /ja-jp/img/... 或 /en-us/img/... 或 /zh-hans/... 这样的路径
if image_path.startswith('/'):
line_no = content[:match.start()].count('\n') + 1
position = match.start()
relative_images.append((match.group(0), image_path, line_no, position))
# 按位置排序,确保按照文档中的顺序处理
relative_images.sort(key=lambda x: x[3])
# 返回时去掉位置信息
return [(match, path, line) for match, path, line, _ in relative_images]
def parse_md_file_for_urls(file_path):
"""仔细解析Markdown文件以提取在线URL和它们的位置"""
urls = []
try:
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read()
lines = content.split('\n')
# 查找图片URLs
for i, line in enumerate(lines):
# 查找Markdown风格的图片
for match in MD_ONLINE_URL_RE.finditer(line):
url = match.group(1)
position = sum(len(l) + 1 for l in lines[:i]) + match.start()
urls.append((url, i+1, position))
# 查找HTML风格的图片
for match in HTML_ONLINE_URL_RE.finditer(line):
url = match.group(1)
position = sum(len(l) + 1 for l in lines[:i]) + match.start()
urls.append((url, i+1, position))
# 按文档中的位置排序
urls.sort(key=lambda x: x[2])
return urls
except Exception as e:
print(f"{Fore.RED}读取文件 {file_path} 时出错: {e}")
return []
def find_corresponding_file(mintlify_file, mintlify_base, dify_base):
"""查找 dify-docs 仓库中对应的文件"""
# 获取从 mintlify_base 到文件的相对路径
rel_path = os.path.relpath(mintlify_file, mintlify_base)
# 提取语言文件夹(路径的第一个组件)
parts = rel_path.split(os.sep)
if len(parts) > 0 and parts[0] in LANGUAGE_MAPPING:
lang_folder = parts[0]
mapped_lang = LANGUAGE_MAPPING[lang_folder]
parts[0] = mapped_lang
# 用映射后的语言重建路径
rel_path = os.path.join(*parts)
# 将扩展名从 .mdx 改为 .md
if rel_path.endswith('.mdx'):
rel_path = rel_path[:-4] + '.md'
# 构建在 dify-docs 中的完整路径
dify_file = os.path.join(dify_base, rel_path)
return dify_file if os.path.exists(dify_file) else None
def extract_img_basename(path):
"""从图片路径中提取基本文件名"""
# 处理常规和语言前缀路径
# 如 /ja-jp/img/jp-env-variable.png 或 /en-us/img/image.png
if path.startswith('/'):
parts = path.split('/')
if len(parts) > 1:
return parts[-1] # 获取最后一部分(文件名)
return os.path.basename(path)
def get_file_extension(path):
"""获取文件扩展名"""
basename = extract_img_basename(path)
if '.' in basename:
return basename.split('.')[-1].lower()
return None
def debug_print_file_comparison(mintlify_file, dify_file):
"""打印两个文件的内容对比,用于调试"""
print(f"\n{Fore.CYAN}======= 文件内容对比 =======")
# 读取mintlify文件内容
try:
with open(mintlify_file, 'r', encoding='utf-8') as f:
mintlify_content = f.read()
print(f"\n{Fore.GREEN}Mintlify文件({mintlify_file}):")
print(mintlify_content[:500] + "..." if len(mintlify_content) > 500 else mintlify_content)
except Exception as e:
print(f"{Fore.RED}读取 {mintlify_file} 错误: {e}")
# 读取dify文件内容
try:
with open(dify_file, 'r', encoding='utf-8') as f:
dify_content = f.read()
print(f"\n{Fore.YELLOW}Dify文件({dify_file}):")
print(dify_content[:500] + "..." if len(dify_content) > 500 else dify_content)
except Exception as e:
print(f"{Fore.RED}读取 {dify_file} 错误: {e}")
# 提取并比较图片URLs
mintlify_images = find_relative_images(mintlify_file)
dify_urls = parse_md_file_for_urls(dify_file)
print(f"\n{Fore.CYAN}Mintlify图片({len(mintlify_images)}):")
for i, (_, img_path, _) in enumerate(mintlify_images):
print(f"{i+1}. {img_path}")
print(f"\n{Fore.CYAN}Dify URLs({len(dify_urls)}):")
for i, (url, _, _) in enumerate(dify_urls):
print(f"{i+1}. {url}")
def match_images_precisely(mintlify_images, dify_file):
"""精确匹配图片,按照文档位置和上下文"""
results = []
dify_urls = parse_md_file_for_urls(dify_file)
# 按照位置对应匹配图片和URL
for i, (match_text, img_path, line_no) in enumerate(mintlify_images):
img_ext = get_file_extension(img_path)
# 检查是否有足够的URLs
if i < len(dify_urls):
# 检查扩展名是否匹配
url_ext = get_file_extension(dify_urls[i][0])
if img_ext and url_ext and img_ext.lower() == url_ext.lower():
results.append((match_text, img_path, line_no, dify_urls[i][0], dify_file))
else:
# 尝试在剩余URL中找到匹配的扩展名
found_match = False
for j, (url, _, _) in enumerate(dify_urls):
if j != i: # 避免重复使用当前位置
j_ext = get_file_extension(url)
if img_ext and j_ext and img_ext.lower() == j_ext.lower():
results.append((match_text, img_path, line_no, url, dify_file))
found_match = True
break
if not found_match:
results.append((match_text, img_path, line_no, None, None))
else:
results.append((match_text, img_path, line_no, None, None))
return results
def get_all_content_after_image(content, image_path):
"""获取图片后的所有文本内容"""
# 查找相对路径的位置
index = content.find(image_path)
if index == -1:
return ""
# 返回图片后的所有内容
return content[index + len(image_path):]
def match_images_by_name_and_context(mintlify_file, dify_file):
"""通过图片名称和上下文匹配图片"""
try:
# 读取文件内容
with open(mintlify_file, 'r', encoding='utf-8') as f:
mintlify_content = f.read()
with open(dify_file, 'r', encoding='utf-8') as f:
dify_content = f.read()
# 获取mintlify文件中的相对图片
mintlify_images = find_relative_images(mintlify_file)
# 提取dify文件中的在线URLs
dify_urls = parse_md_file_for_urls(dify_file)
# 按顺序匹配每个图片
results = []
for match_text, img_path, line_no in mintlify_images:
# 提取图片名称和后缀
img_base = extract_img_basename(img_path)
img_ext = get_file_extension(img_path)
# 获取图片在mintlify文件中的实际位置
img_index = mintlify_content.find(img_path)
if img_index == -1:
img_index = mintlify_content.find(match_text)
# 获取图片后的文本上下文(用于更精确的匹配)
after_text = mintlify_content[img_index + len(match_text):img_index + len(match_text) + 200]
after_text = re.sub(r'[\n\r\s]+', ' ', after_text).strip()
# 尝试通过图片在文档中的位置和顺序匹配
# 首先,确定当前图片是这种类型的第几个图片
same_ext_images = [i for i, (_, p, _) in enumerate(mintlify_images)
if get_file_extension(p) == img_ext]
current_index = same_ext_images.index(mintlify_images.index((match_text, img_path, line_no)))
# 获取对应索引的相同类型URL
same_ext_urls = [(i, u) for i, (u, _, _) in enumerate(dify_urls)
if get_file_extension(u) == img_ext]
if current_index < len(same_ext_urls):
# 按顺序匹配
url_index, url = same_ext_urls[current_index]
results.append((match_text, img_path, line_no, url, dify_file))
else:
# 如果顺序匹配失败,尝试上下文相似度匹配
best_match = None
best_score = 0
for _, (url, url_line, _) in enumerate(dify_urls):
# 检查扩展名是否匹配
url_ext = get_file_extension(url)
if img_ext != url_ext:
continue
# 获取URL在dify文件中的实际位置
url_index = dify_content.find(url)
# 获取URL后的文本上下文
url_after_text = dify_content[url_index + len(url):url_index + len(url) + 200]
url_after_text = re.sub(r'[\n\r\s]+', ' ', url_after_text).strip()
# 计算上下文相似度
matcher = difflib.SequenceMatcher(None, after_text, url_after_text)
score = matcher.ratio()
if score > best_score:
best_score = score
best_match = url
if best_match:
results.append((match_text, img_path, line_no, best_match, dify_file))
else:
results.append((match_text, img_path, line_no, None, None))
return results
except Exception as e:
print(f"{Fore.RED}匹配图片时出错: {e}")
return []
def find_matching_image_url(mintlify_file, dify_file, img_path, order_index=0):
"""查找匹配的图片URL,考虑多种策略"""
# 策略1: 按顺序匹配
# 策略2: 按扩展名匹配
# 策略3: 按上下文匹配
try:
# 获取图片扩展名
img_ext = get_file_extension(img_path)
if not img_ext:
return None
# 解析dify文件中的URLs
urls = parse_md_file_for_urls(dify_file)
# 按顺序和扩展名匹配
# 找出所有扩展名匹配的URLs
matching_urls = []
for url, _, _ in urls:
url_ext = get_file_extension(url)
if url_ext == img_ext:
matching_urls.append(url)
# 如果没有匹配的URL,返回None
if not matching_urls:
return None
# 如果图片序号在有效范围内,按序号匹配
if order_index < len(matching_urls):
return matching_urls[order_index]
# 否则返回第一个匹配的URL
return matching_urls[0]
except Exception as e:
print(f"{Fore.RED}查找匹配URL时出错: {e}")
return None
def validate_content_alignment(mintlify_file, dify_file, img_path, url, check_text_length=200):
"""验证内容对齐情况,确保图片周围的内容相似"""
try:
with open(mintlify_file, 'r', encoding='utf-8') as f:
mintlify_content = f.read()
with open(dify_file, 'r', encoding='utf-8') as f:
dify_content = f.read()
# 在原始文件中找到图片位置
img_index = mintlify_content.find(img_path)
if img_index == -1:
return False
# 在目标文件中找到URL位置
url_index = dify_content.find(url)
if url_index == -1:
return False
# 获取图片后的文本
img_after = mintlify_content[img_index + len(img_path):img_index + len(img_path) + check_text_length]
img_after = re.sub(r'[\n\r\s]+', ' ', img_after).strip()
# 获取URL后的文本
url_after = dify_content[url_index + len(url):url_index + len(url) + check_text_length]
url_after = re.sub(r'[\n\r\s]+', ' ', url_after).strip()
# 计算相似度
matcher = difflib.SequenceMatcher(None, img_after, url_after)
ratio = matcher.ratio()
# 返回相似度是否超过阈值
return ratio > 0.5
except Exception as e:
print(f"{Fore.RED}验证内容对齐时出错: {e}")
return False
def replace_image_in_file(file_path, match_text, online_url):
"""在文件中将单个相对图片路径替换为在线 URL"""
try:
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read()
new_content = content
if '![]()