From 61466c3f45fe0d765a4dac160409aa78297bca8b Mon Sep 17 00:00:00 2001
From: Chenhe Gu <guchenhe@gmail.com>
Date: Tue, 23 Dec 2025 15:45:28 -0800
Subject: [PATCH] support configuring ignored files in auto sync (#629)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Add ignore_files config to exclude specific files from translation

Adds ability to specify source language files that should not be translated:
- New `ignore_files` array in config.json
- Validation ensures each path starts with the source language directory, ends in .md or .mdx, and contains no `..` traversal
- Filtering applied in PRAnalyzer.categorize_files() and SyncPlanGenerator.generate_sync_plan()

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* update config

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
---
 tools/translate/config.json    |  4 +++
 tools/translate/pr_analyzer.py | 65 ++++++++++++++++++++++++++++++++--
 2 files changed, 66 insertions(+), 3 deletions(-)

diff --git a/tools/translate/config.json b/tools/translate/config.json
index 54698b00..c8f5b634 100644
--- a/tools/translate/config.json
+++ b/tools/translate/config.json
@@ -2,6 +2,10 @@
     "source_language": "en",
     "target_languages": ["zh", "ja"],
 
+    "ignore_files": [
+        "en/self-host/configuration/environments.mdx"
+    ],
+
     "processing_limits": {
         "max_files_per_run": 10,
         "max_openapi_files_per_run": 5
diff --git a/tools/translate/pr_analyzer.py b/tools/translate/pr_analyzer.py
index f270983e..bef1c1b4 100644
--- a/tools/translate/pr_analyzer.py
+++ b/tools/translate/pr_analyzer.py
@@ -27,6 +27,9 @@ class PRAnalyzer:
         self.source_language = self.config.get('source_language', 'en')
         self.target_languages = self.config.get('target_languages', ['zh', 'ja'])
 
+        # Load and validate ignore files
+        self.ignore_files = self._load_ignore_files()
+
     def _load_config(self) -> Dict:
         """Load translation configuration."""
         config_path = Path(__file__).parent / "config.json"
@@ -35,6 +38,55 @@ class PRAnalyzer:
                 return json.load(f)
         return {}
 
+    def _load_ignore_files(self) -> List[str]:
+        """Load and validate ignore_files configuration.
+
+        Validates that:
+        - Each path starts with the source language directory
+        - Each path contains no '..' anywhere (blocks directory traversal)
+        - Each path ends with a valid file extension (.md, .mdx)
+
+        Returns:
+            List of ignore paths that passed validation (invalid entries are skipped with a warning)
+        """
+        ignore_files = self.config.get('ignore_files', [])
+        if not ignore_files:
+            return []
+
+        validated = []
+        source_dir = self.get_language_directory(self.source_language)
+
+        for path in ignore_files:
+            # Must start with source language directory
+            if not path.startswith(f"{source_dir}/"):
+                print(f"Warning: Ignore path must start with '{source_dir}/': {path} (skipping)")
+                continue
+
+            # No directory traversal
+            if ".." in path:
+                print(f"Warning: Invalid ignore path (contains '..'): {path} (skipping)")
+                continue
+
+            # Must have valid extension
+            if not any(path.endswith(ext) for ext in ['.md', '.mdx']):
+                print(f"Warning: Ignore path must end with .md or .mdx: {path} (skipping)")
+                continue
+
+            validated.append(path)
+
+        return validated
+
+    def _is_file_ignored(self, file_path: str) -> bool:
+        """Check if a file should be ignored from translation.
+
+        Args:
+            file_path: Path to check (e.g., 'en/guides/some-file.md')
+
+        Returns:
+            True if file is in ignore list, False otherwise
+        """
+        return file_path in self.ignore_files
+
     def get_language_directory(self, lang_code: str) -> str:
         """Get directory name for a language code from config."""
         if 'languages' in self.config and lang_code in self.config['languages']:
@@ -184,16 +236,19 @@ class PRAnalyzer:
             if file == 'docs.json':
                 categories['docs_json'].append(file)
             elif file.startswith(f'{source_dir}/'):
-                if file.endswith(('.md', '.mdx')):
+                # Check if file is in ignore list
+                if self._is_file_ignored(file):
+                    categories['other'].append(file)  # Treat as 'other' so it's not processed
+                elif file.endswith(('.md', '.mdx')):
                     categories['source'].append(file)
-                elif self.is_openapi_file(file):  # NEW
+                elif self.is_openapi_file(file):
                     categories['source_openapi'].append(file)
                 else:
                     categories['other'].append(file)
             elif any(file.startswith(f'{target_dir}/') for target_dir in target_dirs):
                 if file.endswith(('.md', '.mdx')):
                     categories['translation'].append(file)
-                elif self.is_openapi_file(file):  # NEW
+                elif self.is_openapi_file(file):
                     categories['translation_openapi'].append(file)
                 else:
                     categories['other'].append(file)
@@ -462,6 +517,10 @@ class SyncPlanGenerator:
                 docs_json_changed = True
                 continue
 
+            # Skip ignored files
+            if self.analyzer._is_file_ignored(filepath):
+                continue
+
             # Process source language markdown files
             if filepath.startswith('en/') and filepath.endswith(('.md', '.mdx')):
                 file_size = self.get_file_size(filepath)