# dify-docs/tools/translate/json_formatter.py

"""
Format-preserving JSON serialization utilities.

This module detects and preserves the exact formatting of existing JSON files,
allowing surgical edits without reformatting the entire file.
"""

import json
import re
from typing import Any, Dict, Optional, Tuple
from pathlib import Path


class JSONFormat:
    """
    Formatting style detected in (or assumed for) a JSON file.

    A fresh instance describes the default style: four-space consistent
    indentation, a trailing newline, and a space after each key's colon.
    """

    def __init__(self):
        # Indentation character: a single space or a tab
        self.indent_char = ' '
        # How many indent characters make up one nesting level
        self.indent_size = 4
        # Either 'consistent' (every level adds indent_size) or 'mixed'
        self.indent_pattern = 'consistent'
        # Per-level indent increments; used when the pattern is 'mixed'
        self.indent_increments = [4]
        # Whether the file ends with a newline
        self.trailing_newline = True
        # True for `"key": value`, False for `"key":value`
        self.key_spacing = True

    def __repr__(self):
        # Only the indentation-related fields are part of the repr.
        fields = (
            f"char={self.indent_char!r}",
            f"size={self.indent_size}",
            f"pattern={self.indent_pattern}",
            f"increments={self.indent_increments}",
        )
        return "JSONFormat(" + ", ".join(fields) + ")"


def detect_json_format(file_path: str) -> JSONFormat:
    """
    Detect the formatting style of an existing JSON file.

    Analyzes the trailing newline, key spacing, and indentation pattern
    to enable format-preserving edits. Key spacing is sampled from the
    first 100 lines and indentation from the first 300 lines.

    Args:
        file_path: Path to the JSON file to inspect (read as UTF-8).

    Returns:
        A JSONFormat describing the detected style. When the file has no
        indented lines the indentation fields keep their defaults, but the
        trailing newline and key spacing are still detected.
    """
    fmt = JSONFormat()

    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    lines = content.split('\n')

    # Check trailing newline
    fmt.trailing_newline = content.endswith('\n')

    # Detect key spacing (": " vs ":"). This runs before the indentation
    # analysis so that compact or unindented files, which return early
    # below, still get their key spacing recorded.
    colon_samples = [line for line in lines[:100] if '":' in line]
    if colon_samples:
        with_space = sum(1 for line in colon_samples if '": ' in line)
        fmt.key_spacing = with_space > len(colon_samples) // 2

    # Collect the distinct absolute indent widths seen in the sample
    indent_levels = set()

    for line in lines[:300]:  # Sample first 300 lines
        text = line.strip()
        if not text or text.startswith('//'):
            # Blank lines and `//` comment lines carry no indent information
            continue

        # Count leading whitespace (spaces and tabs only)
        leading = len(line) - len(line.lstrip(' \t'))
        tabs = line[:leading].count('\t')

        # Any tab in the leading whitespace marks the file as tab-indented;
        # for such a line only the tabs are counted.
        if tabs > 0:
            fmt.indent_char = '\t'
            indent_count = tabs
        else:
            indent_count = leading

        if indent_count > 0:
            indent_levels.add(indent_count)

    if not indent_levels:
        # No indented lines: keep the default indentation settings
        return fmt

    # Build the increment pattern from the sorted widths actually seen:
    # the first entry is the step from column 0 to the shallowest indent,
    # each later entry is the step to the next deeper indent.
    increments = []
    prev_level = 0
    for level in sorted(indent_levels):
        increments.append(level - prev_level)
        prev_level = level

    # Consistent means every step between observed widths is the same
    if len(set(increments)) == 1:
        fmt.indent_pattern = 'consistent'
        fmt.indent_size = increments[0]
        fmt.indent_increments = [increments[0]]
    else:
        fmt.indent_pattern = 'mixed'
        fmt.indent_increments = increments

    return fmt


def get_indent_for_level(fmt: JSONFormat, level: int) -> str:
    """
    Build the leading-whitespace string for a given nesting level.

    For a 'consistent' format the width is indent_size * level. For any
    other pattern the width is the sum of the first `level` entries of
    indent_increments (entry 0 is the step from level 0 to level 1, entry 1
    the step from level 1 to level 2, and so on).
    """
    if level == 0:
        return ''

    if fmt.indent_pattern == 'consistent':
        return fmt.indent_char * (fmt.indent_size * level)

    steps = fmt.indent_increments
    # Levels deeper than the recorded increments reuse the last recorded
    # step, or 2 when nothing was recorded at all.
    overflow_step = steps[-1] if steps else 2
    width = sum(
        steps[depth] if depth < len(steps) else overflow_step
        for depth in range(level)
    )
    return fmt.indent_char * width


def format_preserving_json_dump(data: Any, fmt: JSONFormat, level: int = 0) -> str:
    """
    Serialize JSON data while preserving the detected formatting style.

    This custom serializer respects:
    - Detected indent pattern (consistent vs mixed)
    - Space vs tab indentation
    - Key spacing preferences

    Non-empty dicts and lists are written one entry per line; empty ones
    are written as `{}` / `[]`. Every other value is delegated to
    `json.dumps(..., ensure_ascii=False)`. The trailing newline is not
    added here.

    Args:
        data: The value to serialize.
        fmt: Formatting style to apply.
        level: Nesting depth of the current structure's opening brace.

    Returns:
        The serialized text. The first line carries no leading indent (the
        caller places it); the closing brace/bracket is indented for `level`.

    Raises:
        TypeError: If a value is not JSON-serializable by `json.dumps`.
    """
    if isinstance(data, dict):
        if not data:
            return '{}'
        opener, closer = '{', '}'
        colon = ': ' if fmt.key_spacing else ':'
        # Keys go through json.dumps so quotes, backslashes and control
        # characters are escaped; non-string keys are stringified first.
        entries = [
            (json.dumps(str(key), ensure_ascii=False) + colon, value)
            for key, value in data.items()
        ]
    elif isinstance(data, list):
        if not data:
            return '[]'
        opener, closer = '[', ']'
        entries = [('', item) for item in data]
    else:
        # Scalars (str, bool, None, numbers) and anything else: let the
        # standard encoder produce the token. Unlike str(), this keeps
        # non-finite floats loadable by the json module.
        return json.dumps(data, ensure_ascii=False)

    indent = get_indent_for_level(fmt, level)
    child_indent = get_indent_for_level(fmt, level + 1)

    lines = [opener]
    last_index = len(entries) - 1

    for i, (prefix, value) in enumerate(entries):
        # Children are serialized one level deeper; a nested container comes
        # back as several lines whose inner lines are already indented.
        rendered = format_preserving_json_dump(value, fmt, level + 1).split('\n')
        # Only the first line needs this level's child indent (and the key)
        rendered[0] = f'{child_indent}{prefix}{rendered[0]}'
        # The separating comma belongs on the entry's final line
        if i != last_index:
            rendered[-1] += ','
        lines.extend(rendered)

    lines.append(f'{indent}{closer}')
    return '\n'.join(lines)


def save_json_with_preserved_format(file_path: str, data: Dict[str, Any],
                                   reference_file: Optional[str] = None) -> bool:
    """
    Write JSON data to a file using the formatting style of an existing file.

    Args:
        file_path: Path to JSON file to write
        data: Dictionary to serialize
        reference_file: Optional path to a file whose formatting should be
                       copied. When omitted, file_path itself is inspected.

    Returns:
        True if the file was written, False if any error occurred (the
        error is printed rather than raised).
    """
    try:
        # The style comes from the reference file when given, otherwise
        # from whatever currently sits at the target path.
        style_source = reference_file or file_path

        if not Path(style_source).exists():
            # Nothing to inspect: fall back to sensible defaults
            fmt = JSONFormat()
            fmt.indent_size = 4
            fmt.indent_pattern = 'consistent'
        else:
            fmt = detect_json_format(style_source)

        # Serialization happens before the target is opened for writing
        text = format_preserving_json_dump(data, fmt, level=0)

        # Restore the trailing newline when the detected style has one
        needs_newline = fmt.trailing_newline and not text.endswith('\n')
        if needs_newline:
            text = text + '\n'

        with open(file_path, 'w', encoding='utf-8') as out:
            out.write(text)

    except Exception as e:
        print(f"Error saving JSON with preserved format: {e}")
        return False

    return True


def validate_format_preservation(original_path: str, new_path: str) -> Dict[str, Any]:
    """
    Compare the detected formatting of two JSON files.

    The indent character, indent pattern, indent size and trailing newline
    of both files are compared.

    Returns a report with:
    - matching: bool (True when none of the compared properties differ)
    - differences: list of human-readable descriptions of each difference
    - original_format: detected format from original
    - new_format: detected format from new file
    """
    original_fmt = detect_json_format(original_path)
    new_fmt = detect_json_format(new_path)

    # (label, attribute, renderer) for each compared property; the indent
    # character is shown via repr so tabs and spaces are visible.
    compared = (
        ("Indent char", 'indent_char', repr),
        ("Indent pattern", 'indent_pattern', format),
        ("Indent size", 'indent_size', format),
        ("Trailing newline", 'trailing_newline', format),
    )

    differences = []
    for label, attr, render in compared:
        before = getattr(original_fmt, attr)
        after = getattr(new_fmt, attr)
        if before != after:
            differences.append(f"{label}: {render(before)} → {render(after)}")

    report = {
        'matching': not differences,
        'differences': differences,
        'original_format': original_fmt,
        'new_format': new_fmt
    }
    return report