From 61466c3f45fe0d765a4dac160409aa78297bca8b Mon Sep 17 00:00:00 2001 From: Chenhe Gu Date: Tue, 23 Dec 2025 15:45:28 -0800 Subject: [PATCH] support configuring ignored files in auto sync (#629) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add ignore_files config to exclude specific files from translation Adds ability to specify source language files that should not be translated: - New `ignore_files` array in config.json - Validation ensures paths start with source dir, have valid extension, no traversal - Filtering applied in PRAnalyzer.categorize_files() and SyncPlanGenerator.generate_sync_plan() 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 * update config --------- Co-authored-by: Claude Opus 4.5 --- tools/translate/config.json | 4 +++ tools/translate/pr_analyzer.py | 65 ++++++++++++++++++++++++++++++++-- 2 files changed, 66 insertions(+), 3 deletions(-) diff --git a/tools/translate/config.json b/tools/translate/config.json index 54698b00..c8f5b634 100644 --- a/tools/translate/config.json +++ b/tools/translate/config.json @@ -2,6 +2,10 @@ "source_language": "en", "target_languages": ["zh", "ja"], + "ignore_files": [ + "en/self-host/configuration/environments.mdx" + ], + "processing_limits": { "max_files_per_run": 10, "max_openapi_files_per_run": 5 diff --git a/tools/translate/pr_analyzer.py b/tools/translate/pr_analyzer.py index f270983e..bef1c1b4 100644 --- a/tools/translate/pr_analyzer.py +++ b/tools/translate/pr_analyzer.py @@ -27,6 +27,9 @@ class PRAnalyzer: self.source_language = self.config.get('source_language', 'en') self.target_languages = self.config.get('target_languages', ['zh', 'ja']) + # Load and validate ignore files + self.ignore_files = self._load_ignore_files() + def _load_config(self) -> Dict: """Load translation configuration.""" config_path = Path(__file__).parent / "config.json" @@ -35,6 +38,55 @@ class PRAnalyzer: return json.load(f) return {} + def _load_ignore_files(self) -> List[str]: + """Load and validate ignore_files configuration. + + Validates that: + - Each path starts with source language directory + - No directory traversal (..) + - Valid file extension (.md, .mdx) + + Returns: + List of validated ignore file paths + """ + ignore_files = self.config.get('ignore_files', []) + if not ignore_files: + return [] + + validated = [] + source_dir = self.get_language_directory(self.source_language) + + for path in ignore_files: + # Must start with source language directory + if not path.startswith(f"{source_dir}/"): + print(f"Warning: Ignore path must start with '{source_dir}/': {path} (skipping)") + continue + + # No directory traversal + if ".." in path: + print(f"Warning: Invalid ignore path (contains '..'): {path} (skipping)") + continue + + # Must have valid extension + if not any(path.endswith(ext) for ext in ['.md', '.mdx']): + print(f"Warning: Ignore path must end with .md or .mdx: {path} (skipping)") + continue + + validated.append(path) + + return validated + + def _is_file_ignored(self, file_path: str) -> bool: + """Check if a file should be ignored from translation. + + Args: + file_path: Path to check (e.g., 'en/guides/some-file.md') + + Returns: + True if file is in ignore list, False otherwise + """ + return file_path in self.ignore_files + def get_language_directory(self, lang_code: str) -> str: """Get directory name for a language code from config.""" if 'languages' in self.config and lang_code in self.config['languages']: @@ -184,16 +236,19 @@ class PRAnalyzer: if file == 'docs.json': categories['docs_json'].append(file) elif file.startswith(f'{source_dir}/'): - if file.endswith(('.md', '.mdx')): + # Check if file is in ignore list + if self._is_file_ignored(file): + categories['other'].append(file) # Treat as 'other' so it's not processed + elif file.endswith(('.md', '.mdx')): categories['source'].append(file) - elif self.is_openapi_file(file): # NEW + elif self.is_openapi_file(file): categories['source_openapi'].append(file) else: categories['other'].append(file) elif any(file.startswith(f'{target_dir}/') for target_dir in target_dirs): if file.endswith(('.md', '.mdx')): categories['translation'].append(file) - elif self.is_openapi_file(file): # NEW + elif self.is_openapi_file(file): categories['translation_openapi'].append(file) else: categories['other'].append(file) @@ -462,6 +517,10 @@ class SyncPlanGenerator: docs_json_changed = True continue + # Skip ignored files + if self.analyzer._is_file_ignored(filepath): + continue + # Process source language markdown files if filepath.startswith('en/') and filepath.endswith(('.md', '.mdx')): file_size = self.get_file_size(filepath)