support configuring ignored files in auto sync (#629)

* Add ignore_files config to exclude specific files from translation

Adds the ability to specify source-language files that should not be translated:
- New `ignore_files` array in config.json
- Validation ensures each path starts with the source language directory, ends with a valid extension (`.md` or `.mdx`), and contains no directory traversal (`..`)
- Filtering applied in PRAnalyzer.categorize_files() and SyncPlanGenerator.generate_sync_plan()

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* Update config: add `en/self-host/configuration/environments.mdx` to `ignore_files`

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Chenhe Gu
2025-12-23 15:45:28 -08:00
committed by GitHub
parent 4f36faa03b
commit 61466c3f45
2 changed files with 66 additions and 3 deletions

View File

@@ -2,6 +2,10 @@
"source_language": "en",
"target_languages": ["zh", "ja"],
"ignore_files": [
"en/self-host/configuration/environments.mdx"
],
"processing_limits": {
"max_files_per_run": 10,
"max_openapi_files_per_run": 5

View File

@@ -27,6 +27,9 @@ class PRAnalyzer:
self.source_language = self.config.get('source_language', 'en')
self.target_languages = self.config.get('target_languages', ['zh', 'ja'])
# Load and validate ignore files
self.ignore_files = self._load_ignore_files()
def _load_config(self) -> Dict:
"""Load translation configuration."""
config_path = Path(__file__).parent / "config.json"
@@ -35,6 +38,55 @@ class PRAnalyzer:
return json.load(f)
return {}
def _load_ignore_files(self) -> List[str]:
"""Load and validate ignore_files configuration.
Validates that:
- Each path starts with source language directory
- No directory traversal (..)
- Valid file extension (.md, .mdx)
Returns:
List of validated ignore file paths
"""
ignore_files = self.config.get('ignore_files', [])
if not ignore_files:
return []
validated = []
source_dir = self.get_language_directory(self.source_language)
for path in ignore_files:
# Must start with source language directory
if not path.startswith(f"{source_dir}/"):
print(f"Warning: Ignore path must start with '{source_dir}/': {path} (skipping)")
continue
# No directory traversal
if ".." in path:
print(f"Warning: Invalid ignore path (contains '..'): {path} (skipping)")
continue
# Must have valid extension
if not any(path.endswith(ext) for ext in ['.md', '.mdx']):
print(f"Warning: Ignore path must end with .md or .mdx: {path} (skipping)")
continue
validated.append(path)
return validated
def _is_file_ignored(self, file_path: str) -> bool:
"""Check if a file should be ignored from translation.
Args:
file_path: Path to check (e.g., 'en/guides/some-file.md')
Returns:
True if file is in ignore list, False otherwise
"""
return file_path in self.ignore_files
def get_language_directory(self, lang_code: str) -> str:
"""Get directory name for a language code from config."""
if 'languages' in self.config and lang_code in self.config['languages']:
@@ -184,16 +236,19 @@ class PRAnalyzer:
if file == 'docs.json':
categories['docs_json'].append(file)
elif file.startswith(f'{source_dir}/'):
if file.endswith(('.md', '.mdx')):
# Check if file is in ignore list
if self._is_file_ignored(file):
categories['other'].append(file) # Treat as 'other' so it's not processed
elif file.endswith(('.md', '.mdx')):
categories['source'].append(file)
elif self.is_openapi_file(file): # NEW
elif self.is_openapi_file(file):
categories['source_openapi'].append(file)
else:
categories['other'].append(file)
elif any(file.startswith(f'{target_dir}/') for target_dir in target_dirs):
if file.endswith(('.md', '.mdx')):
categories['translation'].append(file)
elif self.is_openapi_file(file): # NEW
elif self.is_openapi_file(file):
categories['translation_openapi'].append(file)
else:
categories['other'].append(file)
@@ -462,6 +517,10 @@ class SyncPlanGenerator:
docs_json_changed = True
continue
# Skip ignored files
if self.analyzer._is_file_ignored(filepath):
continue
# Process source language markdown files
if filepath.startswith('en/') and filepath.endswith(('.md', '.mdx')):
file_size = self.get_file_size(filepath)