#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ 交互式图片路径修复工具 该脚本可以交互式地查找并修复 .mdx 文件中的相对图片路径, 将它们替换为原始 dify-docs 仓库中对应的在线 URL。 每次找到一个问题时,会显示详细信息并等待用户确认后再进行修改。 目录映射关系: dify-docs-mintlify -> dify-docs zh-hans -> zh_CN en -> en ja-jp -> jp """ import os import re import sys from pathlib import Path import argparse from colorama import init, Fore, Style import difflib # 初始化 colorama init(autoreset=True) # 查找图片的正则表达式 MD_IMAGE_RE = re.compile(r'!\[(.*?)\]\(((?!https?://|/).+?\.(png|jpe?g|gif|svg))\)') HTML_IMAGE_RE = re.compile(r']*src=["\']([^"\']+\.(png|jpe?g|gif|svg))["\'][^>]*>') FRAME_IMAGE_RE = re.compile(r']*src=["\'](/[^"\']+\.(png|jpe?g|gif|svg))["\'][^>]*>') # 查找在线URL的正则表达式 MD_ONLINE_URL_RE = re.compile(r'!\[[^\]]*\]\((https://[^\s\)]+\.(png|jpe?g|gif|svg|jpeg))\)') HTML_ONLINE_URL_RE = re.compile(r']*src=["\'](https://[^\s"\']+\.(png|jpe?g|gif|svg|jpeg))["\'][^>]*>') # 语言目录映射 LANGUAGE_MAPPING = { 'zh-hans': 'zh_CN', 'en': 'en', 'ja-jp': 'jp' } REVERSE_LANGUAGE_MAPPING = {v: k for k, v in LANGUAGE_MAPPING.items()} def find_relative_images(file_path): """ 在 .mdx 文件中查找所有相对路径的图片。 返回一个包含 (匹配文本, 图片路径, 行号, 位置) 的列表 """ relative_images = [] with open(file_path, 'r', encoding='utf-8') as f: content = f.read() # 检查 Markdown 图片语法 for match in MD_IMAGE_RE.finditer(content): image_path = match.group(2) if not image_path.startswith(('http://', 'https://', '/')): # 记录行号和位置 line_no = content[:match.start()].count('\n') + 1 position = match.start() relative_images.append((match.group(0), image_path, line_no, position)) # 检查 HTML img 标签 for match in HTML_IMAGE_RE.finditer(content): image_path = match.group(1) if not image_path.startswith(('http://', 'https://', '/')): line_no = content[:match.start()].count('\n') + 1 position = match.start() relative_images.append((match.group(0), image_path, line_no, position)) # 检查 Frame 组件中的相对路径 for match in FRAME_IMAGE_RE.finditer(content): image_path = match.group(1) # 如 /ja-jp/img/... 或 /en-us/img/... 或 /zh-hans/... 这样的路径 if image_path.startswith('/'): line_no = content[:match.start()].count('\n') + 1 position = match.start() relative_images.append((match.group(0), image_path, line_no, position)) # 按位置排序,确保按照文档中的顺序处理 relative_images.sort(key=lambda x: x[3]) # 返回时去掉位置信息 return [(match, path, line) for match, path, line, _ in relative_images] def parse_md_file_for_urls(file_path): """仔细解析Markdown文件以提取在线URL和它们的位置""" urls = [] try: with open(file_path, 'r', encoding='utf-8') as f: content = f.read() lines = content.split('\n') # 查找图片URLs for i, line in enumerate(lines): # 查找Markdown风格的图片 for match in MD_ONLINE_URL_RE.finditer(line): url = match.group(1) position = sum(len(l) + 1 for l in lines[:i]) + match.start() urls.append((url, i+1, position)) # 查找HTML风格的图片 for match in HTML_ONLINE_URL_RE.finditer(line): url = match.group(1) position = sum(len(l) + 1 for l in lines[:i]) + match.start() urls.append((url, i+1, position)) # 按文档中的位置排序 urls.sort(key=lambda x: x[2]) return urls except Exception as e: print(f"{Fore.RED}读取文件 {file_path} 时出错: {e}") return [] def find_corresponding_file(mintlify_file, mintlify_base, dify_base): """查找 dify-docs 仓库中对应的文件""" # 获取从 mintlify_base 到文件的相对路径 rel_path = os.path.relpath(mintlify_file, mintlify_base) # 提取语言文件夹(路径的第一个组件) parts = rel_path.split(os.sep) if len(parts) > 0 and parts[0] in LANGUAGE_MAPPING: lang_folder = parts[0] mapped_lang = LANGUAGE_MAPPING[lang_folder] parts[0] = mapped_lang # 用映射后的语言重建路径 rel_path = os.path.join(*parts) # 将扩展名从 .mdx 改为 .md if rel_path.endswith('.mdx'): rel_path = rel_path[:-4] + '.md' # 构建在 dify-docs 中的完整路径 dify_file = os.path.join(dify_base, rel_path) return dify_file if os.path.exists(dify_file) else None def extract_img_basename(path): """从图片路径中提取基本文件名""" # 处理常规和语言前缀路径 # 如 /ja-jp/img/jp-env-variable.png 或 /en-us/img/image.png if path.startswith('/'): parts = path.split('/') if len(parts) > 1: return parts[-1] # 获取最后一部分(文件名) return os.path.basename(path) def get_file_extension(path): """获取文件扩展名""" basename = extract_img_basename(path) if '.' in basename: return basename.split('.')[-1].lower() return None def debug_print_file_comparison(mintlify_file, dify_file): """打印两个文件的内容对比,用于调试""" print(f"\n{Fore.CYAN}======= 文件内容对比 =======") # 读取mintlify文件内容 try: with open(mintlify_file, 'r', encoding='utf-8') as f: mintlify_content = f.read() print(f"\n{Fore.GREEN}Mintlify文件({mintlify_file}):") print(mintlify_content[:500] + "..." if len(mintlify_content) > 500 else mintlify_content) except Exception as e: print(f"{Fore.RED}读取 {mintlify_file} 错误: {e}") # 读取dify文件内容 try: with open(dify_file, 'r', encoding='utf-8') as f: dify_content = f.read() print(f"\n{Fore.YELLOW}Dify文件({dify_file}):") print(dify_content[:500] + "..." if len(dify_content) > 500 else dify_content) except Exception as e: print(f"{Fore.RED}读取 {dify_file} 错误: {e}") # 提取并比较图片URLs mintlify_images = find_relative_images(mintlify_file) dify_urls = parse_md_file_for_urls(dify_file) print(f"\n{Fore.CYAN}Mintlify图片({len(mintlify_images)}):") for i, (_, img_path, _) in enumerate(mintlify_images): print(f"{i+1}. {img_path}") print(f"\n{Fore.CYAN}Dify URLs({len(dify_urls)}):") for i, (url, _, _) in enumerate(dify_urls): print(f"{i+1}. {url}") def match_images_precisely(mintlify_images, dify_file): """精确匹配图片,按照文档位置和上下文""" results = [] dify_urls = parse_md_file_for_urls(dify_file) # 按照位置对应匹配图片和URL for i, (match_text, img_path, line_no) in enumerate(mintlify_images): img_ext = get_file_extension(img_path) # 检查是否有足够的URLs if i < len(dify_urls): # 检查扩展名是否匹配 url_ext = get_file_extension(dify_urls[i][0]) if img_ext and url_ext and img_ext.lower() == url_ext.lower(): results.append((match_text, img_path, line_no, dify_urls[i][0], dify_file)) else: # 尝试在剩余URL中找到匹配的扩展名 found_match = False for j, (url, _, _) in enumerate(dify_urls): if j != i: # 避免重复使用当前位置 j_ext = get_file_extension(url) if img_ext and j_ext and img_ext.lower() == j_ext.lower(): results.append((match_text, img_path, line_no, url, dify_file)) found_match = True break if not found_match: results.append((match_text, img_path, line_no, None, None)) else: results.append((match_text, img_path, line_no, None, None)) return results def get_all_content_after_image(content, image_path): """获取图片后的所有文本内容""" # 查找相对路径的位置 index = content.find(image_path) if index == -1: return "" # 返回图片后的所有内容 return content[index + len(image_path):] def match_images_by_name_and_context(mintlify_file, dify_file): """通过图片名称和上下文匹配图片""" try: # 读取文件内容 with open(mintlify_file, 'r', encoding='utf-8') as f: mintlify_content = f.read() with open(dify_file, 'r', encoding='utf-8') as f: dify_content = f.read() # 获取mintlify文件中的相对图片 mintlify_images = find_relative_images(mintlify_file) # 提取dify文件中的在线URLs dify_urls = parse_md_file_for_urls(dify_file) # 按顺序匹配每个图片 results = [] for match_text, img_path, line_no in mintlify_images: # 提取图片名称和后缀 img_base = extract_img_basename(img_path) img_ext = get_file_extension(img_path) # 获取图片在mintlify文件中的实际位置 img_index = mintlify_content.find(img_path) if img_index == -1: img_index = mintlify_content.find(match_text) # 获取图片后的文本上下文(用于更精确的匹配) after_text = mintlify_content[img_index + len(match_text):img_index + len(match_text) + 200] after_text = re.sub(r'[\n\r\s]+', ' ', after_text).strip() # 尝试通过图片在文档中的位置和顺序匹配 # 首先,确定当前图片是这种类型的第几个图片 same_ext_images = [i for i, (_, p, _) in enumerate(mintlify_images) if get_file_extension(p) == img_ext] current_index = same_ext_images.index(mintlify_images.index((match_text, img_path, line_no))) # 获取对应索引的相同类型URL same_ext_urls = [(i, u) for i, (u, _, _) in enumerate(dify_urls) if get_file_extension(u) == img_ext] if current_index < len(same_ext_urls): # 按顺序匹配 url_index, url = same_ext_urls[current_index] results.append((match_text, img_path, line_no, url, dify_file)) else: # 如果顺序匹配失败,尝试上下文相似度匹配 best_match = None best_score = 0 for _, (url, url_line, _) in enumerate(dify_urls): # 检查扩展名是否匹配 url_ext = get_file_extension(url) if img_ext != url_ext: continue # 获取URL在dify文件中的实际位置 url_index = dify_content.find(url) # 获取URL后的文本上下文 url_after_text = dify_content[url_index + len(url):url_index + len(url) + 200] url_after_text = re.sub(r'[\n\r\s]+', ' ', url_after_text).strip() # 计算上下文相似度 matcher = difflib.SequenceMatcher(None, after_text, url_after_text) score = matcher.ratio() if score > best_score: best_score = score best_match = url if best_match: results.append((match_text, img_path, line_no, best_match, dify_file)) else: results.append((match_text, img_path, line_no, None, None)) return results except Exception as e: print(f"{Fore.RED}匹配图片时出错: {e}") return [] def find_matching_image_url(mintlify_file, dify_file, img_path, order_index=0): """查找匹配的图片URL,考虑多种策略""" # 策略1: 按顺序匹配 # 策略2: 按扩展名匹配 # 策略3: 按上下文匹配 try: # 获取图片扩展名 img_ext = get_file_extension(img_path) if not img_ext: return None # 解析dify文件中的URLs urls = parse_md_file_for_urls(dify_file) # 按顺序和扩展名匹配 # 找出所有扩展名匹配的URLs matching_urls = [] for url, _, _ in urls: url_ext = get_file_extension(url) if url_ext == img_ext: matching_urls.append(url) # 如果没有匹配的URL,返回None if not matching_urls: return None # 如果图片序号在有效范围内,按序号匹配 if order_index < len(matching_urls): return matching_urls[order_index] # 否则返回第一个匹配的URL return matching_urls[0] except Exception as e: print(f"{Fore.RED}查找匹配URL时出错: {e}") return None def validate_content_alignment(mintlify_file, dify_file, img_path, url, check_text_length=200): """验证内容对齐情况,确保图片周围的内容相似""" try: with open(mintlify_file, 'r', encoding='utf-8') as f: mintlify_content = f.read() with open(dify_file, 'r', encoding='utf-8') as f: dify_content = f.read() # 在原始文件中找到图片位置 img_index = mintlify_content.find(img_path) if img_index == -1: return False # 在目标文件中找到URL位置 url_index = dify_content.find(url) if url_index == -1: return False # 获取图片后的文本 img_after = mintlify_content[img_index + len(img_path):img_index + len(img_path) + check_text_length] img_after = re.sub(r'[\n\r\s]+', ' ', img_after).strip() # 获取URL后的文本 url_after = dify_content[url_index + len(url):url_index + len(url) + check_text_length] url_after = re.sub(r'[\n\r\s]+', ' ', url_after).strip() # 计算相似度 matcher = difflib.SequenceMatcher(None, img_after, url_after) ratio = matcher.ratio() # 返回相似度是否超过阈值 return ratio > 0.5 except Exception as e: print(f"{Fore.RED}验证内容对齐时出错: {e}") return False def replace_image_in_file(file_path, match_text, online_url): """在文件中将单个相对图片路径替换为在线 URL""" try: with open(file_path, 'r', encoding='utf-8') as f: content = f.read() new_content = content if '