dify-docs/.github/workflows/sync_docs_analyze.yml

name: Analyze Documentation Changes

on:
  pull_request:
    branches: [main, revamp]
    types: [opened, synchronize, reopened]
    paths:
      # IMPORTANT: These paths should match the language directories defined in tools/translate/config.json
      # Currently configured for: en (source), cn, jp (targets)
      # If you add/remove languages in config.json, update these paths accordingly
      - 'docs.json'
      - 'en/**/*.md'
      - 'en/**/*.mdx'
      - 'en/**/openapi*.json'
      - 'zh/**/*.md'
      - 'zh/**/*.mdx'
      - 'zh/**/openapi*.json'
      - 'ja/**/*.md'
      - 'ja/**/*.mdx'
      - 'ja/**/openapi*.json'
      - 'versions/**/*.md'
      - 'versions/**/*.mdx'

permissions:
  contents: read
  pull-requests: read

jobs:
  analyze:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout PR
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.9'

      - name: Determine comparison range
        id: determine-range
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          echo "Determining comparison range..."

          PR_NUMBER="${{ github.event.pull_request.number }}"
          EVENT_ACTION="${{ github.event.action }}"
          PR_BASE="${{ github.event.pull_request.base.sha }}"
          PR_HEAD="${{ github.event.pull_request.head.sha }}"

          if [ "$EVENT_ACTION" = "synchronize" ]; then
            echo "🔄 Synchronize event - detecting incremental changes"

            # Try to get last processed commit from translation PR
            TRANSLATION_PR=$(gh pr list \
              --search "head:docs-sync-pr-${PR_NUMBER} state:open" \
              --json number \
              --jq '.[0].number // empty' 2>/dev/null || echo "")

            LAST_PROCESSED=""
            if [ -n "$TRANSLATION_PR" ]; then
              echo "Found translation PR #${TRANSLATION_PR}"

              # Extract last processed commit from comments (reverse order to get latest)
              LAST_PROCESSED=$(gh pr view "$TRANSLATION_PR" \
                --json comments \
                --jq '.comments | reverse | .[] | .body' 2>/dev/null \
                | grep -oP 'Last-Processed-Commit: \K[a-f0-9]+' \
                | head -1 || echo "")

              if [ -n "$LAST_PROCESSED" ]; then
                echo "✅ Found tracked commit in translation PR: $LAST_PROCESSED"
              fi
            fi

            # Use tracked commit if available, otherwise fall back to github.event.before
            if [ -n "$LAST_PROCESSED" ]; then
              COMPARE_BASE="$LAST_PROCESSED"
              echo "Using last processed commit: $COMPARE_BASE"
            elif [ -n "${{ github.event.before }}" ] && [ "${{ github.event.before }}" != "0000000000000000000000000000000000000000" ]; then
              COMPARE_BASE="${{ github.event.before }}"
              echo "Using github.event.before: $COMPARE_BASE"
            else
              # Fallback to PR base (first push after PR creation)
              COMPARE_BASE="$PR_BASE"
              echo "⚠️ No previous commit found, using PR base: $COMPARE_BASE"
            fi

            COMPARE_HEAD="$PR_HEAD"
            IS_INCREMENTAL="true"

          else
            echo "🆕 New PR event - analyzing full changes"

            # Use merge-base to find where branch diverged from main
            # This allows stale branches to trigger automation without false "mixed content" errors
            MERGE_BASE=$(git merge-base "$PR_BASE" "$PR_HEAD")
            echo "Branch diverged from main at: $MERGE_BASE"

            COMPARE_BASE="$MERGE_BASE"
            COMPARE_HEAD="$PR_HEAD"
            IS_INCREMENTAL="false"
          fi

          echo "compare_base=$COMPARE_BASE" >> $GITHUB_OUTPUT
          echo "compare_head=$COMPARE_HEAD" >> $GITHUB_OUTPUT
          echo "is_incremental=$IS_INCREMENTAL" >> $GITHUB_OUTPUT

          echo "📊 Comparison range: $COMPARE_BASE...$COMPARE_HEAD"

      - name: Categorize and validate PR changes
        id: categorize
        run: |
          echo "Categorizing PR changes..."

          # Get comparison range from previous step
          BASE_SHA="${{ steps.determine-range.outputs.compare_base }}"
          HEAD_SHA="${{ steps.determine-range.outputs.compare_head }}"

          echo "Base SHA: $BASE_SHA"
          echo "Head SHA: $HEAD_SHA"

          # Run PR analyzer
          cd tools/translate
          python pr_analyzer.py "$BASE_SHA" "$HEAD_SHA" > /tmp/pr_analysis_output.txt 2>&1

          # Parse analyzer output
          if [ $? -eq 0 ]; then
            # Successful analysis
            source /tmp/pr_analysis_output.txt
            echo "PR categorization successful"
            echo "PR Type: $pr_type"
            echo "Should Skip: $should_skip"

            # Set GitHub outputs
            echo "pr_type=$pr_type" >> $GITHUB_OUTPUT
            echo "should_skip=$should_skip" >> $GITHUB_OUTPUT

            if [ "$should_skip" = "true" ]; then
              if [ "$pr_type" = "translation" ]; then
                echo "✅ Translation-only PR detected. Skipping automation (direct review process)."
              elif [ "$pr_type" = "none" ]; then
                echo "✅ No relevant documentation changes detected. Skipping workflow."
              fi
              exit 0
            fi
          else
            # Analysis failed - likely mixed PR
            echo "PR categorization failed - likely mixed content PR"
            ERROR_MESSAGE=$(cat /tmp/pr_analysis_output.txt | grep "error_message=" | cut -d'=' -f2- || echo "Mixed content PR detected")
            echo "error=mixed_pr" >> $GITHUB_OUTPUT
            echo "error_message<<EOF" >> $GITHUB_OUTPUT
            echo "$ERROR_MESSAGE" >> $GITHUB_OUTPUT
            echo "EOF" >> $GITHUB_OUTPUT
            exit 1
          fi

      - name: Analyze source language changes for translation
        if: steps.categorize.outputs.pr_type == 'source'
        id: analyze
        run: |
          echo "Analyzing source language changes for automatic translation..."

          # Use comparison range from determine-range step
          BASE_SHA="${{ steps.determine-range.outputs.compare_base }}"
          HEAD_SHA="${{ steps.determine-range.outputs.compare_head }}"
          IS_INCREMENTAL="${{ steps.determine-range.outputs.is_incremental }}"

          echo "Comparison: $BASE_SHA...$HEAD_SHA"
          echo "Incremental: $IS_INCREMENTAL"

          # Get all changed files (not just English ones for file analysis)
          CHANGED_FILES=$(git diff --name-only $BASE_SHA $HEAD_SHA)

          # Count changes for security limits
          FILE_COUNT=$(echo "$CHANGED_FILES" | wc -l)
          echo "Changed files count: $FILE_COUNT"

          # Security check: Limit number of files
          MAX_FILES=50
          if [ "$FILE_COUNT" -gt "$MAX_FILES" ]; then
            echo "Error: Too many files changed ($FILE_COUNT > $MAX_FILES)"
            echo "error=too_many_files" >> $GITHUB_OUTPUT
            exit 1
          fi

          # Create analysis report
          cat > /tmp/analysis.json <<EOF
          {
            "pr_number": ${{ github.event.pull_request.number }},
            "pr_title": "${{ github.event.pull_request.title }}",
            "pr_author": "${{ github.event.pull_request.user.login }}",
            "base_sha": "$BASE_SHA",
            "head_sha": "$HEAD_SHA",
            "is_incremental": $IS_INCREMENTAL,
            "event_action": "${{ github.event.action }}",
            "file_count": $FILE_COUNT,
            "timestamp": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
            "repository": "${{ github.repository }}",
            "ref": "${{ github.ref }}",
            "pr_type": "source"
          }
          EOF

          # Save changed files list
          echo "$CHANGED_FILES" > /tmp/changed_files.txt

          # Analyze file types and sizes for source language files that need translation
          > /tmp/file_analysis.txt
          > /tmp/openapi_analysis.txt
          while IFS= read -r file; do
            if [[ "$file" =~ ^en/.*\.(md|mdx)$ ]] && [ -f "$file" ]; then
              SIZE=$(stat -f%z "$file" 2>/dev/null || stat -c%s "$file" 2>/dev/null || echo "0")
              echo "$file|$SIZE|markdown" >> /tmp/file_analysis.txt

              # Security check: File size limit (10MB)
              MAX_SIZE=$((10 * 1024 * 1024))
              if [ "$SIZE" -gt "$MAX_SIZE" ]; then
                echo "Error: File $file exceeds size limit ($SIZE > $MAX_SIZE)"
                echo "error=file_too_large" >> $GITHUB_OUTPUT
                exit 1
              fi
            elif [[ "$file" =~ ^en/.*/openapi.*\.json$ ]] && [ -f "$file" ]; then
              SIZE=$(stat -f%z "$file" 2>/dev/null || stat -c%s "$file" 2>/dev/null || echo "0")
              echo "$file|$SIZE|openapi_json" >> /tmp/openapi_analysis.txt

              # Security check: File size limit for OpenAPI JSON (10MB)
              MAX_SIZE=$((10 * 1024 * 1024))
              if [ "$SIZE" -gt "$MAX_SIZE" ]; then
                echo "Error: OpenAPI file $file exceeds size limit ($SIZE > $MAX_SIZE)"
                echo "error=file_too_large" >> $GITHUB_OUTPUT
                exit 1
              fi
            fi
          done <<< "$CHANGED_FILES"

          # Check for docs.json changes
          if echo "$CHANGED_FILES" | grep -q '^docs\.json$'; then
            echo "true" > /tmp/docs_json_changed.txt

            # Use PR analyzer's docs.json analysis
            cd tools/translate
            python3 - <<EOF
          import sys
          sys.path.append('.')
          from pr_analyzer import PRAnalyzer

          analyzer = PRAnalyzer("$BASE_SHA", "$HEAD_SHA")
          docs_changes = analyzer.analyze_docs_json_changes()

          structure_changes = {
              "structure_changed": docs_changes["any_docs_json_changes"],
              "navigation_modified": docs_changes["source_section"],
              "languages_affected": analyzer.config["target_languages"] if docs_changes["source_section"] else []
          }

          import json
          with open("/tmp/structure_changes.json", "w") as f:
              json.dump(structure_changes, f, indent=2)
          EOF
          else
            echo "false" > /tmp/docs_json_changed.txt
            echo '{"structure_changed": false, "navigation_modified": false, "languages_affected": []}' > /tmp/structure_changes.json
          fi

          echo "has_changes=true" >> $GITHUB_OUTPUT
          echo "Analysis complete"

      - name: Validate file paths
        if: steps.analyze.outputs.has_changes == 'true'
        run: |
          echo "Validating source language file paths for translation..."

          # Security: Validate source language files that will be translated
          while IFS='|' read -r file size; do
            if [ -n "$file" ]; then
              # Check for directory traversal attempts
              if echo "$file" | grep -q '\.\./'; then
                echo "Error: Invalid file path detected: $file"
                exit 1
              fi

              # Check file extension for source language files
              if ! echo "$file" | grep -qE '\.(md|mdx)$'; then
                echo "Error: Invalid file type for translation: $file"
                exit 1
              fi

              # Check path starts with en/ (only source language files need translation)
              if ! echo "$file" | grep -qE '^en/'; then
                echo "Error: Non-source-language file in translation list: $file"
                exit 1
              fi
            fi
          done < /tmp/file_analysis.txt

          # Validate OpenAPI JSON files
          if [ -f "/tmp/openapi_analysis.txt" ] && [ -s "/tmp/openapi_analysis.txt" ]; then
            while IFS='|' read -r file size file_type; do
              if [ -n "$file" ]; then
                # Check for directory traversal
                if echo "$file" | grep -q '\.\./'; then
                  echo "Error: Invalid file path: $file"
                  exit 1
                fi

                # Check file extension
                if ! echo "$file" | grep -qE '\.json$'; then
                  echo "Error: Invalid OpenAPI file type: $file"
                  exit 1
                fi

                # Check path starts with en/
                if ! echo "$file" | grep -qE '^en/'; then
                  echo "Error: Non-source-language OpenAPI file in translation list: $file"
                  exit 1
                fi

                # Check pattern match (configurable via openapi*.json)
                if ! echo "$file" | grep -qE 'openapi.*\.json$'; then
                  echo "Error: File doesn't match OpenAPI pattern: $file"
                  exit 1
                fi
              fi
            done < /tmp/openapi_analysis.txt
          fi

          echo "All source language file paths validated for translation"

      - name: Create analysis summary
        if: steps.analyze.outputs.has_changes == 'true'
        run: |
          echo "Creating analysis summary for source language changes..."

          BASE_SHA="${{ steps.determine-range.outputs.compare_base }}"
          HEAD_SHA="${{ steps.determine-range.outputs.compare_head }}"
          PR_NUMBER=${{ github.event.pull_request.number }}
          IS_INCREMENTAL="${{ steps.determine-range.outputs.is_incremental }}"

          # Use SyncPlanGenerator for consistent logic across workflows
          cd tools/translate
          python3 - <<EOF
          import json
          import sys
          sys.path.append('.')
          from pr_analyzer import SyncPlanGenerator

          # Generate sync plan using centralized logic
          generator = SyncPlanGenerator("$BASE_SHA", "$HEAD_SHA")
          sync_plan = generator.generate_sync_plan()

          # Add PR metadata to sync plan
          sync_plan["metadata"].update({
              "pr_number": $PR_NUMBER,
              "pr_title": "${{ github.event.pull_request.title }}",
              "pr_author": "${{ github.event.pull_request.user.login }}",
              "event_action": "${{ github.event.action }}",
              "is_incremental": "$IS_INCREMENTAL" == "true",
              "file_count": len(sync_plan["files_to_sync"]) + len(sync_plan["openapi_files_to_sync"]),
              "timestamp": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
              "repository": "${{ github.repository }}",
              "ref": "${{ github.ref }}",
              "pr_type": "source"
          })

          # Save analysis.json (for backward compatibility with execute workflow)
          with open("/tmp/analysis.json", "w") as f:
              json.dump(sync_plan["metadata"], f, indent=2)

          # Save sync plan
          with open("/tmp/sync_plan.json", "w") as f:
              json.dump(sync_plan, f, indent=2)

          print(f"Source language sync plan created:")
          print(f"  - {len(sync_plan['files_to_sync'])} markdown files to translate")
          print(f"  - {len(sync_plan['openapi_files_to_sync'])} OpenAPI JSON files to translate")
          if sync_plan['structure_changes'].get('structure_changed'):
              print("  - Documentation structure changes detected")
          EOF

      - name: Upload analysis artifacts
        if: steps.analyze.outputs.has_changes == 'true'
        uses: actions/upload-artifact@v4
        with:
          name: docs-sync-analysis-${{ github.run_id }}
          path: |
            /tmp/analysis.json
            /tmp/changed_files.txt
            /tmp/file_analysis.txt
            /tmp/openapi_analysis.txt
            /tmp/sync_plan.json
            /tmp/docs_json_changed.txt
            /tmp/structure_changes.json
          retention-days: 1