diff --git a/tools/translate/sync_and_translate.py b/tools/translate/sync_and_translate.py index a375066e..b5a8247e 100644 --- a/tools/translate/sync_and_translate.py +++ b/tools/translate/sync_and_translate.py @@ -1383,11 +1383,19 @@ class DocsSynchronizer: deleted_files: List[str] = None, renamed_files: List[Tuple[str, str]] = None, base_sha: str = None, - head_sha: str = None + head_sha: str = None, + reference_sha: str = None ) -> List[str]: """ Incrementally sync docs.json structure - only processes changed files. Preserves existing dropdown names and only updates affected pages. + + Args: + reference_sha: If provided, use this commit's docs.json for finding file positions + in the English section. This is needed when the working directory's + docs.json (from main) doesn't have the new files yet (stale PR scenario). + The translation sections are still modified in the working directory's + docs.json. """ sync_log = [] added_files = added_files or [] @@ -1447,7 +1455,7 @@ class DocsSynchronizer: sync_log.append("ERROR: No languages found in navigation") return sync_log - # Find language sections + # Find language sections in working directory docs.json (for modifying translations) source_section = None target_sections = {} @@ -1457,6 +1465,40 @@ class DocsSynchronizer: elif lang_data.get("language") in self.target_languages: target_sections[lang_data.get("language")] = lang_data + # If reference_sha is provided, load that docs.json for finding file positions + # This is needed when working directory (main) doesn't have the new files yet + reference_source_section = source_section + if reference_sha and added_files: + try: + import subprocess + result = subprocess.run( + ["git", "show", f"{reference_sha}:docs.json"], + cwd=self.base_dir, + capture_output=True, + text=True, + check=True + ) + ref_docs = json.loads(result.stdout) + ref_navigation = ref_docs.get("navigation", {}) + + # Handle both structures + ref_languages = None + if "languages" in ref_navigation and isinstance(ref_navigation["languages"], list): + ref_languages = ref_navigation["languages"] + elif "versions" in ref_navigation and len(ref_navigation["versions"]) > 0: + if "languages" in ref_navigation["versions"][0]: + ref_languages = ref_navigation["versions"][0]["languages"] + + if ref_languages: + for lang_data in ref_languages: + if lang_data.get("language") == self.source_language: + reference_source_section = lang_data + sync_log.append(f"INFO: Using reference docs.json from {reference_sha[:8]} for finding file positions") + break + except Exception as e: + sync_log.append(f"WARNING: Could not load reference docs.json from {reference_sha}: {e}") + # Fall back to working directory's source section + if not source_section: sync_log.append("ERROR: Source language section not found") return sync_log @@ -1469,7 +1511,8 @@ class DocsSynchronizer: continue # Find which dropdown contains this file in source language section - result = self.find_dropdown_containing_file(source_file, source_section) + # Use reference_source_section (from PR's docs.json) to find new file positions + result = self.find_dropdown_containing_file(source_file, reference_source_section) if not result: sync_log.append(f"WARNING: Could not find {source_file} in source language navigation") continue @@ -1478,9 +1521,10 @@ class DocsSynchronizer: sync_log.append(f"INFO: Found {source_file} in '{source_dropdown_name}' dropdown at location {file_location}") # Get the source language dropdown for reference + # Use reference_source_section (from PR's docs.json) for new file structure source_dropdown = None source_dropdown_index = -1 - for i, dropdown in enumerate(source_section.get("dropdowns", [])): + for i, dropdown in enumerate(reference_source_section.get("dropdowns", [])): if dropdown.get("dropdown") == source_dropdown_name: source_dropdown = dropdown source_dropdown_index = i @@ -1490,6 +1534,40 @@ class DocsSynchronizer: sync_log.append(f"WARNING: Could not find source language dropdown '{source_dropdown_name}'") continue + # STALE PR FIX: Also update the working directory's EN section if the file + # was found in reference docs.json but doesn't exist in current EN section. + # This happens when the translation branch is based on main but the PR's + # docs.json hasn't merged yet. + if reference_source_section != source_section: + # Find or create corresponding dropdown in working directory's EN section + wd_en_dropdown = None + wd_en_dropdowns = source_section.get("dropdowns", []) + if source_dropdown_index >= 0 and source_dropdown_index < len(wd_en_dropdowns): + wd_en_dropdown = wd_en_dropdowns[source_dropdown_index] + + if not wd_en_dropdown: + # Dropdown doesn't exist in working directory - create it + wd_en_dropdown = { + "dropdown": source_dropdown.get("dropdown", ""), + "icon": source_dropdown.get("icon", "book-open"), + "pages": [] + } + source_section.setdefault("dropdowns", []) + # Ensure we have enough slots + while len(source_section["dropdowns"]) <= source_dropdown_index: + source_section["dropdowns"].append(None) + source_section["dropdowns"][source_dropdown_index] = wd_en_dropdown + sync_log.append(f"INFO: Created EN dropdown '{wd_en_dropdown['dropdown']}' from PR") + + if wd_en_dropdown: + # Add the page to EN section at the correct location + if "pages" not in wd_en_dropdown: + wd_en_dropdown["pages"] = [] + + added_to_en = self.add_page_at_location(wd_en_dropdown, source_file, file_location, source_dropdown) + if added_to_en: + sync_log.append(f"INFO: Added {source_file} to EN section (from PR's docs.json)") + # Add to each target language for target_lang, target_section in target_sections.items(): target_file = self.convert_path_to_target_language(source_file, target_lang) @@ -1548,6 +1626,14 @@ class DocsSynchronizer: sync_log.append(f"INFO: Processing deletion of {source_file}") + # STALE PR FIX: Also remove from EN section if it exists there + # This is needed when starting from main's docs.json which may still have the file + for en_dropdown in source_section.get("dropdowns", []): + if "pages" in en_dropdown: + if self.remove_page_from_structure(en_dropdown["pages"], source_file): + sync_log.append(f"INFO: Removed {source_file} from EN section") + break + # Remove from each target language for target_lang, target_section in target_sections.items(): target_file = self.convert_path_to_target_language(source_file, target_lang) diff --git a/tools/translate/translate_pr.py b/tools/translate/translate_pr.py index 799e33c3..ab33dff6 100755 --- a/tools/translate/translate_pr.py +++ b/tools/translate/translate_pr.py @@ -128,103 +128,155 @@ class TranslationPRManager: ) return result.returncode == 0 - def merge_docs_json_for_incremental_update(self) -> None: + def _merge_docs_json_from_main(self) -> None: """ Merge docs.json for incremental updates: - - Source language section from PR HEAD (latest structure) - - Translation sections from translation branch (preserve existing translations) + - Full structure from main (latest, includes changes from all merged PRs) + - Translation sections from translation branch (preserve our existing translations) + + This fixes the issue where stale PRs would revert changes from PRs merged after them. """ - print("Merging docs.json: Source language from PR, translations from translation branch...") + print("Merging docs.json: structure from main, translations from branch...") - # Get docs.json from PR HEAD (has latest source language structure) - result = self.run_git("show", f"{self.head_sha}:docs.json") - pr_docs = json.loads(result.stdout) + # Get docs.json from main (has latest structure including all merged PRs) + result = self.run_git("show", "origin/main:docs.json") + main_docs = json.loads(result.stdout) - # Get docs.json from translation branch (has target language translations) + # Get docs.json from translation branch (has our translations) docs_json_path = self.repo_root / "docs.json" with open(docs_json_path, 'r', encoding='utf-8') as f: - translation_docs = json.load(f) + branch_docs = json.load(f) - # Merge strategy: Replace source language section from PR, keep translations from translation branch # Navigate to language sections - pr_navigation = pr_docs.get("navigation", {}) - translation_navigation = translation_docs.get("navigation", {}) + main_navigation = main_docs.get("navigation", {}) + branch_navigation = branch_docs.get("navigation", {}) # Handle both direct languages and versions structure - if "versions" in pr_navigation: - pr_languages = pr_navigation["versions"][0].get("languages", []) - translation_languages = translation_navigation.get("versions", [{}])[0].get("languages", []) + if "versions" in main_navigation: + main_languages = main_navigation["versions"][0].get("languages", []) + branch_languages = branch_navigation.get("versions", [{}])[0].get("languages", []) else: - pr_languages = pr_navigation.get("languages", []) - translation_languages = translation_navigation.get("languages", []) + main_languages = main_navigation.get("languages", []) + branch_languages = branch_navigation.get("languages", []) - # Build language lookup from translation branch - translation_langs_by_code = {} - for lang_data in translation_languages: + # Build lookup from translation branch + branch_langs_by_code = {} + for lang_data in branch_languages: lang_code = lang_data.get("language") if lang_code: - translation_langs_by_code[lang_code] = lang_data + branch_langs_by_code[lang_code] = lang_data - # Merge: Use source language from PR, translations from translation branch + # Merge strategy: + # - Use main's structure for everything (includes all recent changes) + # - For target languages, preserve our translations from branch merged_languages = [] - for pr_lang in pr_languages: - lang_code = pr_lang.get("language") + for main_lang in main_languages: + lang_code = main_lang.get("language") - if lang_code == self.source_language: - # Use source language section from PR (latest structure) - merged_languages.append(pr_lang) - elif lang_code in translation_langs_by_code: - # Use translations from translation branch (preserve existing translations) - merged_languages.append(translation_langs_by_code[lang_code]) + if lang_code in self.target_languages and lang_code in branch_langs_by_code: + # For target languages, use branch's version (has our translations) + merged_languages.append(branch_langs_by_code[lang_code]) else: - # Fallback: use from PR - merged_languages.append(pr_lang) + # For source language and others, use main's version + merged_languages.append(main_lang) # Update the merged docs.json - merged_docs = pr_docs.copy() - if "versions" in pr_navigation: + merged_docs = main_docs.copy() + if "versions" in main_navigation: merged_docs["navigation"]["versions"][0]["languages"] = merged_languages else: merged_docs["navigation"]["languages"] = merged_languages # Write merged docs.json preserving original formatting - # Use translation branch's docs.json as reference for format detection success = save_json_with_preserved_format( str(docs_json_path), merged_docs, - reference_file=str(docs_json_path) # Current file from translation branch + reference_file=str(docs_json_path) ) if success: - print(f"✓ Merged docs.json: Source language from PR {self.head_sha[:8]}, translations from {self.sync_branch}") + print(f"✓ Merged docs.json: structure from main, translations from {self.sync_branch}") else: print(f"⚠️ Warning: Could not preserve formatting, using default") - # Fallback to standard json.dump if format preservation fails with open(docs_json_path, 'w', encoding='utf-8') as f: json.dump(merged_docs, f, indent=2, ensure_ascii=False) + def _checkout_pr_changed_files(self, files_to_sync: List) -> None: + """ + Checkout only the specific source language files that the PR changed. + + This prevents overwriting files from other PRs that were merged after this PR was created. + By only checking out the files this PR actually changed, we preserve the current state + of all other files (from main or translation branch). + """ + source_files = [] + for file_info in files_to_sync: + file_path = file_info.get("path") if isinstance(file_info, dict) else file_info + # Only checkout source language files (not docs.json - handled separately) + if file_path.startswith(f"{self.source_dir}/") and file_path != "docs.json": + source_files.append(file_path) + + if not source_files: + print("No source files to checkout from PR") + return + + print(f"Checking out {len(source_files)} changed source files from PR {self.head_sha[:8]}...") + + for file_path in source_files: + result = self.run_git("checkout", self.head_sha, "--", file_path, check=False) + if result.returncode == 0: + print(f" ✓ {file_path}") + else: + # File might be deleted in PR or not exist - that's OK + print(f" ⚠️ {file_path} (not in PR commit - may be deleted)") + def setup_translation_branch(self, branch_exists: bool) -> None: - """Setup the translation branch (create or checkout existing).""" + """ + Setup the translation branch (create or checkout existing). + + Key fix for stale PR issue: + - For NEW branches: Start from origin/main (not PR's working directory) + This ensures we have the latest state including changes from PRs merged after this PR was created. + - For EXISTING branches: Checkout branch, then merge main's docs.json structure + This preserves our translations while getting the latest navigation structure. + + The actual PR files to translate are checked out selectively later in run_translation_from_sync_plan(). + """ + # Store for later use in run_translation_from_sync_plan + self._branch_existed = branch_exists + if branch_exists: print(f"✅ Fetching existing translation branch for incremental update: {self.sync_branch}") self.run_git("fetch", "origin", f"{self.sync_branch}:{self.sync_branch}") + self.run_git("fetch", "origin", "main") self.run_git("checkout", self.sync_branch) - # For incremental updates, checkout source language files only (not docs.json) - print(f"Checking out source language files from {self.head_sha[:8]}...") - self.run_git("checkout", self.head_sha, "--", f"{self.source_dir}/", check=False) + # Merge main's docs.json structure with our translations + # This ensures we have the latest navigation structure from main + # while preserving our existing translations + self._merge_docs_json_from_main() - # Merge docs.json: Source language from PR HEAD, translations from translation branch - self.merge_docs_json_for_incremental_update() + # Note: Source language files will be checked out selectively later + # in run_translation_from_sync_plan() based on what the PR actually changed else: print(f"🆕 Creating new translation branch: {self.sync_branch}") - self.run_git("checkout", "-b", self.sync_branch) - # Reset branch to main to avoid including source language file changes from PR - # Use --soft to keep working directory with PR files (needed for translation) - self.run_git("reset", "--soft", "origin/main") - # Unstage everything - self.run_git("reset") + # CRITICAL FIX: Start from origin/main, not from the PR's working directory + # This ensures we have the latest state including all changes from PRs + # that were merged after this PR was created. + # + # Old approach (buggy): + # checkout -b branch → reset --soft origin/main → reset + # This kept PR's working directory which could be stale + # + # New approach (fixed): + # checkout -b branch origin/main + # This starts fresh from main, then we selectively checkout PR's changed files + self.run_git("fetch", "origin", "main") + self.run_git("checkout", "-b", self.sync_branch, "origin/main") + + # Note: Source language files will be checked out selectively later + # in run_translation_from_sync_plan() based on what the PR actually changed async def run_translation(self) -> Dict: """Run the translation process using sync_and_translate logic.""" @@ -286,6 +338,11 @@ class TranslationPRManager: base_sha = metadata.get("base_sha", self.base_sha) head_sha = metadata.get("head_sha", self.head_sha) + # CRITICAL: Checkout only the files that this PR actually changed + # This is the key fix for the stale PR issue - we only bring in the PR's + # changed files, not its entire working directory state which could be outdated + self._checkout_pr_changed_files(files_to_sync) + # Detect added vs modified files and renames added_files, modified_files, renamed_files = self.detect_file_changes(base_sha, head_sha) @@ -500,12 +557,15 @@ class TranslationPRManager: break # Sync docs.json incrementally + # Pass head_sha as reference_sha so sync_docs_json_incremental can find + # new file positions in PR's docs.json (needed when main doesn't have them yet) sync_log = synchronizer.sync_docs_json_incremental( added_files=added_files, deleted_files=deleted_files, renamed_files=renamed_files, base_sha=base_sha, - head_sha=head_sha + head_sha=head_sha, + reference_sha=head_sha ) print("\n".join(sync_log))