mirror of
https://github.com/langgenius/dify-docs.git
synced 2026-03-26 13:18:34 +07:00
307 lines
10 KiB
Python
307 lines
10 KiB
Python
"""
|
|
Format-preserving JSON serialization utilities.
|
|
|
|
This module detects and preserves the exact formatting of existing JSON files,
|
|
allowing surgical edits without reformatting the entire file.
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
from typing import Any, Dict, Optional, Tuple
|
|
from pathlib import Path
|
|
|
|
|
|
class JSONFormat:
    """Container for the formatting style detected in a JSON file.

    A fresh instance describes the default style: four-space consistent
    indentation, a trailing newline, and a space after each key's colon.
    """

    def __init__(self):
        # Indentation character: ' ' or '\t'.
        self.indent_char = ' '
        # How many indent characters make up one nesting level.
        self.indent_size = 4
        # 'consistent' when every level adds the same width, else 'mixed'.
        self.indent_pattern = 'consistent'
        # Width added at each successive nesting level.
        self.indent_increments = [4]
        # Whether the file ends with a newline.
        self.trailing_newline = True
        # True for `"key": value`, False for `"key":value`.
        self.key_spacing = True

    def __repr__(self):
        fields = (
            f"char={self.indent_char!r}",
            f"size={self.indent_size}",
            f"pattern={self.indent_pattern}",
            f"increments={self.indent_increments}",
        )
        return "JSONFormat(" + ", ".join(fields) + ")"
|
|
|
|
|
|
def detect_json_format(file_path: str) -> JSONFormat:
    """
    Detect the formatting style of an existing JSON file.

    Reads the file as UTF-8 and inspects:

    - whether the content ends with a newline,
    - whether keys are written as `"key": value` or `"key":value`
      (majority vote over the first 100 lines that contain `":`),
    - the indentation character and the width added at each nesting level
      (from the first 300 lines; blank lines and lines starting with `//`
      are ignored).

    Tab-indented and space-indented lines are tallied separately and the
    character used by more lines wins; spaces win a tie. Only the widths
    seen with the winning character feed the indent pattern, so a stray
    tab in a space-indented file (or vice versa) cannot corrupt the
    detected pattern.

    Args:
        file_path: Path of the JSON file to analyse.

    Returns:
        A JSONFormat describing the detected style. Attributes for which
        the file offers no evidence keep their JSONFormat defaults.
    """
    fmt = JSONFormat()

    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    lines = content.split('\n')
    fmt.trailing_newline = content.endswith('\n')

    # Key spacing is detected before the indentation analysis so that files
    # without any indented line still get their colon style recorded.
    colon_samples = [line for line in lines[:100] if '":' in line]
    if colon_samples:
        with_space = sum(1 for line in colon_samples if '": ' in line)
        fmt.key_spacing = with_space > len(colon_samples) // 2

    # Distinct indent widths per character, plus how many lines used each.
    space_levels = set()
    tab_levels = set()
    space_lines = 0
    tab_lines = 0

    for line in lines[:300]:  # Sample first 300 lines
        text = line.strip()
        if not text or text.startswith('//'):
            continue

        lead_len = len(line) - len(line.lstrip(' \t'))
        if lead_len == 0:
            continue

        tabs = line[:lead_len].count('\t')
        if tabs:
            # Any tab in the leading whitespace marks a tab-indented line;
            # its depth is the number of tabs.
            tab_lines += 1
            tab_levels.add(tabs)
        else:
            space_lines += 1
            space_levels.add(lead_len)

    if not (space_lines or tab_lines):
        # No indented lines at all: keep the default indentation settings.
        return fmt

    if tab_lines > space_lines:
        fmt.indent_char = '\t'
        levels = sorted(tab_levels)
    else:
        levels = sorted(space_levels)

    # Width added at each step of the observed progression, starting from 0.
    increments = [cur - prev for prev, cur in zip([0] + levels, levels)]

    distinct = set(increments)
    if len(distinct) == 1:
        step = increments[0]
        fmt.indent_pattern = 'consistent'
        fmt.indent_size = step
        fmt.indent_increments = [step]
    else:
        fmt.indent_pattern = 'mixed'
        fmt.indent_increments = increments

    return fmt
|
|
|
|
|
|
def get_indent_for_level(fmt: JSONFormat, level: int) -> str:
    """
    Build the indentation string for a given nesting level.

    For a 'consistent' pattern the width is `indent_size * level`. For any
    other pattern the per-level increments are summed: entry 0 is the step
    from level 0 to level 1, entry 1 the step from level 1 to level 2, and
    so on. Levels beyond the recorded increments reuse the last recorded
    one, or 2 when none are recorded.
    """
    if level == 0:
        return ''

    if fmt.indent_pattern == 'consistent':
        return fmt.indent_char * (fmt.indent_size * level)

    steps = fmt.indent_increments
    # Step applied once the recorded increments are exhausted.
    overflow_step = steps[-1] if steps else 2
    width = sum(
        steps[depth] if depth < len(steps) else overflow_step
        for depth in range(level)
    )
    return fmt.indent_char * width
|
|
|
|
|
|
def format_preserving_json_dump(data: Any, fmt: JSONFormat, level: int = 0) -> str:
    """
    Serialize JSON data while preserving the detected formatting style.

    This custom serializer respects:
    - Detected indent pattern (consistent vs mixed)
    - Space vs tab indentation
    - Key spacing preferences

    Non-empty dicts and lists are written one entry per line; empty ones
    are written as `{}` / `[]`. Every other value is delegated to
    `json.dumps(..., ensure_ascii=False)`, which covers strings, booleans,
    None and numbers. No trailing newline is appended here.

    Object keys are escaped through `json.dumps` as well, so keys that
    contain quotes, backslashes or control characters still yield valid
    JSON. Non-string keys are converted with `str()` before quoting.

    Args:
        data: Value to serialize.
        fmt: Detected formatting style to reproduce.
        level: Nesting depth of the current structure's opening brace.

    Returns:
        The serialized text. The first line carries no indentation (the
        caller places it); the closing bracket is indented for `level`.

    Raises:
        TypeError: If a value is not serializable by `json.dumps`.
    """
    if isinstance(data, dict):
        if not data:
            return '{}'
        opener, closer = '{', '}'
        colon = ': ' if fmt.key_spacing else ':'
        entries = [
            json.dumps(key if isinstance(key, str) else str(key), ensure_ascii=False)
            + colon
            + format_preserving_json_dump(value, fmt, level + 1)
            for key, value in data.items()
        ]
    elif isinstance(data, list):
        if not data:
            return '[]'
        opener, closer = '[', ']'
        entries = [format_preserving_json_dump(item, fmt, level + 1) for item in data]
    else:
        # Scalars (and any other type json can encode) never span lines.
        return json.dumps(data, ensure_ascii=False)

    indent = get_indent_for_level(fmt, level)
    child_indent = get_indent_for_level(fmt, level + 1)

    # A nested container's text starts unindented and already carries the
    # right indentation on its following lines, so prefixing the child
    # indent and joining with ",\n" is enough for single- and multi-line
    # entries alike.
    body = ',\n'.join(child_indent + entry for entry in entries)
    return f'{opener}\n{body}\n{indent}{closer}'
|
|
|
|
|
|
def save_json_with_preserved_format(file_path: str, data: Dict[str, Any],
                                    reference_file: Optional[str] = None) -> bool:
    """
    Write JSON data to a file using the formatting style of an existing file.

    The style is detected from `reference_file` when one is given, otherwise
    from `file_path` itself. If that source does not exist, default
    formatting (4-character consistent indentation) is used.

    Args:
        file_path: Path to JSON file to write
        data: Dictionary to serialize
        reference_file: Optional path to reference file for format detection.

    Returns:
        True if successful, False otherwise. Any exception is caught,
        printed, and reported as False.
    """
    try:
        format_source = reference_file or file_path

        if not Path(format_source).exists():
            # Nothing to detect from: fall back to defaults for new files.
            fmt = JSONFormat()
            fmt.indent_size = 4
            fmt.indent_pattern = 'consistent'
        else:
            fmt = detect_json_format(format_source)

        content = format_preserving_json_dump(data, fmt, level=0)

        # Mirror the trailing-newline convention of the format source.
        needs_newline = fmt.trailing_newline and not content.endswith('\n')
        output = (content + '\n') if needs_newline else content

        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(output)
    except Exception as e:
        print(f"Error saving JSON with preserved format: {e}")
        return False

    return True
|
|
|
|
|
|
def validate_format_preservation(original_path: str, new_path: str) -> Dict[str, Any]:
    """
    Validate that formatting was preserved between two JSON files.

    Both files are analysed with `detect_json_format` and the detected
    indent character, indent pattern, indent size, trailing-newline flag
    and key spacing are compared. Key spacing is included because the
    serializer reproduces it, so a change in colon style is a formatting
    regression like any other.

    Args:
        original_path: Path of the file whose style is the reference.
        new_path: Path of the file to check against it.

    Returns a report with:
    - matching: bool (whether formats match)
    - differences: list of detected differences
    - original_format: detected format from original
    - new_format: detected format from new file
    """
    original_fmt = detect_json_format(original_path)
    new_fmt = detect_json_format(new_path)

    # (label used in the report, JSONFormat attribute, value renderer).
    # The indent character is shown via repr so tabs and spaces are visible.
    checks = (
        ("Indent char", 'indent_char', repr),
        ("Indent pattern", 'indent_pattern', str),
        ("Indent size", 'indent_size', str),
        ("Trailing newline", 'trailing_newline', str),
        ("Key spacing", 'key_spacing', str),
    )

    differences = []
    for label, attr, render in checks:
        before = getattr(original_fmt, attr)
        after = getattr(new_fmt, attr)
        if before != after:
            differences.append(f"{label}: {render(before)} → {render(after)}")

    return {
        'matching': not differences,
        'differences': differences,
        'original_format': original_fmt,
        'new_format': new_fmt
    }
|