# Analyzes documentation pull requests: works out which commit range to
# compare, classifies the PR with tools/translate/pr_analyzer.py, and (for
# source-language PRs) prepares the file analysis used for translation.
name: Analyze Documentation Changes

on:
  pull_request:
    branches: [main, revamp]
    types: [opened, synchronize, reopened]
    paths:
      # IMPORTANT: These paths should match the language directories defined
      # in tools/translate/config.json
      # The filters below currently cover the directories en (source) and
      # zh, ja (targets)
      # If you add/remove languages in config.json, update these paths
      # accordingly
      - 'docs.json'
      - 'en/**/*.md'
      - 'en/**/*.mdx'
      - 'en/**/openapi*.json'
      - 'zh/**/*.md'
      - 'zh/**/*.mdx'
      - 'zh/**/openapi*.json'
      - 'ja/**/*.md'
      - 'ja/**/*.mdx'
      - 'ja/**/openapi*.json'
      - 'versions/**/*.md'
      - 'versions/**/*.mdx'

permissions:
  contents: read
  pull-requests: read

jobs:
  analyze:
    runs-on: ubuntu-latest
    steps:
      # Full history is required: later steps diff arbitrary commits and call
      # `git merge-base`.
      - name: Checkout PR
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.9'

      # Outputs:
      #   compare_base    - commit to diff from
      #   compare_head    - commit to diff to (always the PR head)
      #   is_incremental  - "true" on synchronize events, "false" otherwise
      - name: Determine comparison range
        id: determine-range
        env:
          GH_TOKEN: ${{ github.token }}
          # Event data is passed through the environment instead of being
          # interpolated into the script text.
          PR_NUMBER: ${{ github.event.pull_request.number }}
          EVENT_ACTION: ${{ github.event.action }}
          EVENT_BEFORE: ${{ github.event.before }}
          PR_BASE: ${{ github.event.pull_request.base.sha }}
          PR_HEAD: ${{ github.event.pull_request.head.sha }}
        run: |
          echo "Determining comparison range..."

          if [ "$EVENT_ACTION" = "synchronize" ]; then
            echo "🔄 Synchronize event - detecting incremental changes"

            # Try to get last processed commit from translation PR
            TRANSLATION_PR=$(gh pr list \
              --search "head:docs-sync-pr-${PR_NUMBER} state:open" \
              --json number \
              --jq '.[0].number // empty' 2>/dev/null || echo "")

            LAST_PROCESSED=""
            if [ -n "$TRANSLATION_PR" ]; then
              echo "Found translation PR #${TRANSLATION_PR}"

              # Extract last processed commit from comments
              # (reverse order to get latest)
              LAST_PROCESSED=$(gh pr view "$TRANSLATION_PR" \
                --json comments \
                --jq '.comments | reverse | .[] | .body' 2>/dev/null \
                | grep -oP 'Last-Processed-Commit: \K[a-f0-9]+' \
                | head -1 || echo "")

              if [ -n "$LAST_PROCESSED" ]; then
                echo "✅ Found tracked commit in translation PR: $LAST_PROCESSED"
              fi
            fi

            # Use tracked commit if available, otherwise fall back to
            # github.event.before
            if [ -n "$LAST_PROCESSED" ]; then
              COMPARE_BASE="$LAST_PROCESSED"
              echo "Using last processed commit: $COMPARE_BASE"
            elif [ -n "$EVENT_BEFORE" ] && [ "$EVENT_BEFORE" != "0000000000000000000000000000000000000000" ]; then
              COMPARE_BASE="$EVENT_BEFORE"
              echo "Using github.event.before: $COMPARE_BASE"
            else
              # Fallback to PR base (first push after PR creation)
              COMPARE_BASE="$PR_BASE"
              echo "⚠️ No previous commit found, using PR base: $COMPARE_BASE"
            fi

            COMPARE_HEAD="$PR_HEAD"
            IS_INCREMENTAL="true"
          else
            echo "🆕 New PR event - analyzing full changes"

            # Use merge-base to find where branch diverged from main
            # This allows stale branches to trigger automation without false
            # "mixed content" errors
            MERGE_BASE=$(git merge-base "$PR_BASE" "$PR_HEAD")
            echo "Branch diverged from main at: $MERGE_BASE"

            COMPARE_BASE="$MERGE_BASE"
            COMPARE_HEAD="$PR_HEAD"
            IS_INCREMENTAL="false"
          fi

          {
            echo "compare_base=$COMPARE_BASE"
            echo "compare_head=$COMPARE_HEAD"
            echo "is_incremental=$IS_INCREMENTAL"
          } >> "$GITHUB_OUTPUT"

          echo "📊 Comparison range: $COMPARE_BASE...$COMPARE_HEAD"

      # Outputs on success: pr_type, should_skip.
      # Outputs on analyzer failure: error=mixed_pr, error_message (multiline);
      # the step then exits 1.
      - name: Categorize and validate PR changes
        id: categorize
        env:
          # Comparison range from the previous step
          BASE_SHA: ${{ steps.determine-range.outputs.compare_base }}
          HEAD_SHA: ${{ steps.determine-range.outputs.compare_head }}
        run: |
          echo "Categorizing PR changes..."

          echo "Base SHA: $BASE_SHA"
          echo "Head SHA: $HEAD_SHA"

          cd tools/translate

          # Run the analyzer inside the `if` condition. The default shell for
          # run steps is `bash -e {0}`, so a separate `$?` check after a
          # failing command would never be reached and the error outputs
          # below would never be written.
          if python pr_analyzer.py "$BASE_SHA" "$HEAD_SHA" > /tmp/pr_analysis_output.txt 2>&1; then
            # Successful analysis
            # NOTE(review): the file holds the analyzer's stdout AND stderr
            # and is executed as shell here; presumably it contains only
            # key=value lines (pr_type, should_skip) - confirm against
            # pr_analyzer.py.
            source /tmp/pr_analysis_output.txt
            echo "PR categorization successful"
            echo "PR Type: $pr_type"
            echo "Should Skip: $should_skip"

            # Set GitHub outputs
            {
              echo "pr_type=$pr_type"
              echo "should_skip=$should_skip"
            } >> "$GITHUB_OUTPUT"

            if [ "$should_skip" = "true" ]; then
              if [ "$pr_type" = "translation" ]; then
                echo "✅ Translation-only PR detected. Skipping automation (direct review process)."
              elif [ "$pr_type" = "none" ]; then
                echo "✅ No relevant documentation changes detected. Skipping workflow."
              fi
              exit 0
            fi
          else
            # Analysis failed - likely mixed PR
            echo "PR categorization failed - likely mixed content PR"

            # A `grep | cut || echo default` fallback cannot fire because the
            # pipeline's status is cut's, so apply the default explicitly
            # when nothing was extracted.
            ERROR_MESSAGE=$(grep "error_message=" /tmp/pr_analysis_output.txt | cut -d'=' -f2- || true)
            if [ -z "$ERROR_MESSAGE" ]; then
              ERROR_MESSAGE="Mixed content PR detected"
            fi

            echo "error=mixed_pr" >> "$GITHUB_OUTPUT"
            # Multiline output: name<<DELIMITER, value lines, DELIMITER
            {
              echo "error_message<<EOF"
              echo "$ERROR_MESSAGE"
              echo "EOF"
            } >> "$GITHUB_OUTPUT"
            exit 1
          fi

      # Runs only for PRs the categorize step classified as source-language.
      - name: Analyze source language changes for translation
        if: steps.categorize.outputs.pr_type == 'source'
        id: analyze
        run: |
          echo "Analyzing source language changes for automatic translation..."
# Use comparison range from determine-range step BASE_SHA="${{ steps.determine-range.outputs.compare_base }}" HEAD_SHA="${{ steps.determine-range.outputs.compare_head }}" IS_INCREMENTAL="${{ steps.determine-range.outputs.is_incremental }}" echo "Comparison: $BASE_SHA...$HEAD_SHA" echo "Incremental: $IS_INCREMENTAL" # Get all changed files (not just English ones for file analysis) CHANGED_FILES=$(git diff --name-only $BASE_SHA $HEAD_SHA) # Count changes for security limits FILE_COUNT=$(echo "$CHANGED_FILES" | wc -l) echo "Changed files count: $FILE_COUNT" # Security check: Limit number of files MAX_FILES=50 if [ "$FILE_COUNT" -gt "$MAX_FILES" ]; then echo "Error: Too many files changed ($FILE_COUNT > $MAX_FILES)" echo "error=too_many_files" >> $GITHUB_OUTPUT exit 1 fi # Create analysis report cat > /tmp/analysis.json < /tmp/changed_files.txt # Analyze file types and sizes for source language files that need translation > /tmp/file_analysis.txt > /tmp/openapi_analysis.txt while IFS= read -r file; do if [[ "$file" =~ ^en/.*\.(md|mdx)$ ]] && [ -f "$file" ]; then SIZE=$(stat -f%z "$file" 2>/dev/null || stat -c%s "$file" 2>/dev/null || echo "0") echo "$file|$SIZE|markdown" >> /tmp/file_analysis.txt # Security check: File size limit (10MB) MAX_SIZE=$((10 * 1024 * 1024)) if [ "$SIZE" -gt "$MAX_SIZE" ]; then echo "Error: File $file exceeds size limit ($SIZE > $MAX_SIZE)" echo "error=file_too_large" >> $GITHUB_OUTPUT exit 1 fi elif [[ "$file" =~ ^en/.*/openapi.*\.json$ ]] && [ -f "$file" ]; then SIZE=$(stat -f%z "$file" 2>/dev/null || stat -c%s "$file" 2>/dev/null || echo "0") echo "$file|$SIZE|openapi_json" >> /tmp/openapi_analysis.txt # Security check: File size limit for OpenAPI JSON (10MB) MAX_SIZE=$((10 * 1024 * 1024)) if [ "$SIZE" -gt "$MAX_SIZE" ]; then echo "Error: OpenAPI file $file exceeds size limit ($SIZE > $MAX_SIZE)" echo "error=file_too_large" >> $GITHUB_OUTPUT exit 1 fi fi done <<< "$CHANGED_FILES" # Check for docs.json changes if echo "$CHANGED_FILES" | grep 
-q '^docs\.json$'; then echo "true" > /tmp/docs_json_changed.txt # Use PR analyzer's docs.json analysis cd tools/translate python3 - < /tmp/docs_json_changed.txt echo '{"structure_changed": false, "navigation_modified": false, "languages_affected": []}' > /tmp/structure_changes.json fi echo "has_changes=true" >> $GITHUB_OUTPUT echo "Analysis complete" - name: Validate file paths if: steps.analyze.outputs.has_changes == 'true' run: | echo "Validating source language file paths for translation..." # Security: Validate source language files that will be translated while IFS='|' read -r file size; do if [ -n "$file" ]; then # Check for directory traversal attempts if echo "$file" | grep -q '\.\./'; then echo "Error: Invalid file path detected: $file" exit 1 fi # Check file extension for source language files if ! echo "$file" | grep -qE '\.(md|mdx)$'; then echo "Error: Invalid file type for translation: $file" exit 1 fi # Check path starts with en/ (only source language files need translation) if ! echo "$file" | grep -qE '^en/'; then echo "Error: Non-source-language file in translation list: $file" exit 1 fi fi done < /tmp/file_analysis.txt # Validate OpenAPI JSON files if [ -f "/tmp/openapi_analysis.txt" ] && [ -s "/tmp/openapi_analysis.txt" ]; then while IFS='|' read -r file size file_type; do if [ -n "$file" ]; then # Check for directory traversal if echo "$file" | grep -q '\.\./'; then echo "Error: Invalid file path: $file" exit 1 fi # Check file extension if ! echo "$file" | grep -qE '\.json$'; then echo "Error: Invalid OpenAPI file type: $file" exit 1 fi # Check path starts with en/ if ! echo "$file" | grep -qE '^en/'; then echo "Error: Non-source-language OpenAPI file in translation list: $file" exit 1 fi # Check pattern match (configurable via openapi*.json) if ! 
echo "$file" | grep -qE 'openapi.*\.json$'; then echo "Error: File doesn't match OpenAPI pattern: $file" exit 1 fi fi done < /tmp/openapi_analysis.txt fi echo "All source language file paths validated for translation" - name: Create analysis summary if: steps.analyze.outputs.has_changes == 'true' run: | echo "Creating analysis summary for source language changes..." BASE_SHA="${{ steps.determine-range.outputs.compare_base }}" HEAD_SHA="${{ steps.determine-range.outputs.compare_head }}" PR_NUMBER=${{ github.event.pull_request.number }} IS_INCREMENTAL="${{ steps.determine-range.outputs.is_incremental }}" # Use SyncPlanGenerator for consistent logic across workflows cd tools/translate python3 - <