Files
dify-docs/tools/translate/json_formatter.py
2025-11-26 04:46:31 -08:00

307 lines
10 KiB
Python

"""
Format-preserving JSON serialization utilities.
This module detects and preserves the exact formatting of existing JSON files,
allowing surgical edits without reformatting the entire file.
"""
import json
import re
from typing import Any, Dict, Optional, Tuple
from pathlib import Path
class JSONFormat:
"""Detected JSON formatting style"""
def __init__(self):
self.indent_char = ' ' # ' ' or '\t'
self.indent_size = 4 # Number of indent chars per level
self.indent_pattern = 'consistent' # 'consistent' or 'mixed'
self.indent_increments = [4] # List of space counts per level
self.trailing_newline = True
self.key_spacing = True # Space after colon: "key": value vs "key":value
def __repr__(self):
return (f"JSONFormat(char={repr(self.indent_char)}, "
f"size={self.indent_size}, pattern={self.indent_pattern}, "
f"increments={self.indent_increments})")
def detect_json_format(file_path: str) -> JSONFormat:
"""
Detect the formatting style of an existing JSON file.
Analyzes indentation pattern, whitespace, and structural formatting
to enable format-preserving edits.
"""
fmt = JSONFormat()
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read()
lines = content.split('\n')
# Check trailing newline
fmt.trailing_newline = content.endswith('\n')
# Detect indent character and pattern by tracking absolute indent levels
indent_levels = {} # Maps absolute space count to frequency
for line_num, line in enumerate(lines[:300]): # Sample first 300 lines
if not line.strip() or line.strip().startswith('//'):
continue
# Count leading whitespace
stripped = line.lstrip(' \t')
if not stripped:
continue
spaces = len(line) - len(stripped)
tabs = line[:spaces].count('\t')
# Detect tab vs space
if tabs > 0:
fmt.indent_char = '\t'
indent_count = tabs
else:
indent_count = spaces
if indent_count > 0:
indent_levels[indent_count] = indent_levels.get(indent_count, 0) + 1
if not indent_levels:
# Fallback to default
return fmt
# Sort indent levels to build the actual progression
sorted_levels = sorted(indent_levels.keys())
# Build increment pattern from actual levels seen
increments = []
if sorted_levels:
prev_level = 0
for level in sorted_levels:
increment = level - prev_level
increments.append(increment)
prev_level = level
# Check if consistent (all increments the same)
unique_increments = list(set(increments))
if len(unique_increments) == 1:
fmt.indent_pattern = 'consistent'
fmt.indent_size = unique_increments[0]
fmt.indent_increments = [unique_increments[0]]
else:
fmt.indent_pattern = 'mixed'
fmt.indent_increments = increments
# Detect key spacing (": " vs ":")
colon_samples = [line for line in lines[:100] if '":' in line]
if colon_samples:
with_space = sum(1 for line in colon_samples if '": ' in line)
fmt.key_spacing = with_space > len(colon_samples) // 2
return fmt
def get_indent_for_level(fmt: JSONFormat, level: int) -> str:
"""
Get the indent string for a specific nesting level.
Handles both consistent and mixed indent patterns.
"""
if level == 0:
return ''
if fmt.indent_pattern == 'consistent':
count = fmt.indent_size * level
else:
# For mixed patterns, sum up increments up to this level
# increments[0] is the increment from level 0 to level 1
# increments[1] is the increment from level 1 to level 2, etc.
count = 0
for i in range(level):
if i < len(fmt.indent_increments):
count += fmt.indent_increments[i]
else:
# If we run out of recorded increments, use the last one
count += fmt.indent_increments[-1] if fmt.indent_increments else 2
return fmt.indent_char * count
def format_preserving_json_dump(data: Any, fmt: JSONFormat, level: int = 0) -> str:
"""
Serialize JSON data while preserving the detected formatting style.
This custom serializer respects:
- Detected indent pattern (consistent vs mixed)
- Space vs tab indentation
- Key spacing preferences
- Trailing newline conventions
Note: level indicates the nesting depth of the current structure's opening brace.
"""
indent = get_indent_for_level(fmt, level)
child_indent = get_indent_for_level(fmt, level + 1)
colon = ': ' if fmt.key_spacing else ':'
if isinstance(data, dict):
if not data:
return '{}'
lines = ['{']
items = list(data.items())
for i, (key, value) in enumerate(items):
is_last = (i == len(items) - 1)
# Serialize child values at the same structural level (they'll handle their own nesting)
serialized_value = format_preserving_json_dump(value, fmt, level + 1)
# Check if value is multiline
if '\n' in serialized_value:
# Multiline value (object or array) - needs special handling
value_lines = serialized_value.split('\n')
comma = '' if is_last else ','
# First line goes on same line as key
lines.append(f'{child_indent}"{key}"{colon}{value_lines[0]}')
# Remaining lines keep their indentation
for vline in value_lines[1:-1]:
lines.append(vline)
# Last line gets the comma
lines.append(value_lines[-1] + comma)
else:
# Single line value
comma = '' if is_last else ','
lines.append(f'{child_indent}"{key}"{colon}{serialized_value}{comma}')
lines.append(f'{indent}}}')
return '\n'.join(lines)
elif isinstance(data, list):
if not data:
return '[]'
lines = ['[']
for i, item in enumerate(data):
is_last = (i == len(data) - 1)
serialized_item = format_preserving_json_dump(item, fmt, level + 1)
# Check if item is multiline
if '\n' in serialized_item:
# Multiline item needs proper indentation
item_lines = serialized_item.split('\n')
comma = '' if is_last else ','
# First line gets child indent
lines.append(f'{child_indent}{item_lines[0]}')
# Remaining lines keep their indentation
for iline in item_lines[1:-1]:
lines.append(iline)
# Last line gets the comma
lines.append(item_lines[-1] + comma)
else:
# Single line item
comma = '' if is_last else ','
lines.append(f'{child_indent}{serialized_item}{comma}')
lines.append(f'{indent}]')
return '\n'.join(lines)
elif isinstance(data, str):
# Escape special characters
escaped = json.dumps(data, ensure_ascii=False)
return escaped
elif isinstance(data, bool):
return 'true' if data else 'false'
elif data is None:
return 'null'
elif isinstance(data, (int, float)):
return str(data)
else:
# Fallback to standard JSON serialization
return json.dumps(data, ensure_ascii=False)
def save_json_with_preserved_format(file_path: str, data: Dict[str, Any],
reference_file: Optional[str] = None) -> bool:
"""
Save JSON data to file while preserving the original formatting style.
Args:
file_path: Path to JSON file to write
data: Dictionary to serialize
reference_file: Optional path to reference file for format detection.
If not provided, uses file_path for detection.
Returns:
True if successful, False otherwise
"""
try:
# Detect format from reference file or existing target file
format_source = reference_file if reference_file else file_path
if Path(format_source).exists():
fmt = detect_json_format(format_source)
else:
# Use sensible defaults for new files
fmt = JSONFormat()
fmt.indent_size = 4
fmt.indent_pattern = 'consistent'
# Serialize with preserved format
content = format_preserving_json_dump(data, fmt, level=0)
# Add trailing newline if detected in original
if fmt.trailing_newline and not content.endswith('\n'):
content += '\n'
# Write to file
with open(file_path, 'w', encoding='utf-8') as f:
f.write(content)
return True
except Exception as e:
print(f"Error saving JSON with preserved format: {e}")
return False
def validate_format_preservation(original_path: str, new_path: str) -> Dict[str, Any]:
"""
Validate that formatting was preserved between two JSON files.
Returns a report with:
- matching: bool (whether formats match)
- differences: list of detected differences
- original_format: detected format from original
- new_format: detected format from new file
"""
original_fmt = detect_json_format(original_path)
new_fmt = detect_json_format(new_path)
differences = []
if original_fmt.indent_char != new_fmt.indent_char:
differences.append(f"Indent char: {repr(original_fmt.indent_char)}{repr(new_fmt.indent_char)}")
if original_fmt.indent_pattern != new_fmt.indent_pattern:
differences.append(f"Indent pattern: {original_fmt.indent_pattern}{new_fmt.indent_pattern}")
if original_fmt.indent_size != new_fmt.indent_size:
differences.append(f"Indent size: {original_fmt.indent_size}{new_fmt.indent_size}")
if original_fmt.trailing_newline != new_fmt.trailing_newline:
differences.append(f"Trailing newline: {original_fmt.trailing_newline}{new_fmt.trailing_newline}")
return {
'matching': len(differences) == 0,
'differences': differences,
'original_format': original_fmt,
'new_format': new_fmt
}