diff --git a/CLAUDE.md b/CLAUDE.md index 83801cd2..782106a8 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -128,10 +128,28 @@ SUCCESS: Moved cn/test-file to new location SUCCESS: Moved jp/test-file to new location ``` +## Translation A/B Testing + +For comparing translation quality between models or prompt variations: + +```bash +cd tools/translate-test-dify +./setup.sh +source venv/bin/activate +python run_test.py +python compare.py results// +``` + +**Important**: +- Never commit `results/`, `mock_docs/`, or real API keys +- Always redact keys with `app-***` before committing +- See `tools/translate-test-dify/README.md` for details + ## Key Paths - `docs.json` - Navigation structure - `tools/translate/config.json` - Language configuration (single source of truth) - `tools/translate/termbase_i18n.md` - Translation terminology database - `tools/translate/sync_and_translate.py` - Core translation + surgical reconciliation logic +- `tools/translate-test-dify/` - Translation A/B testing framework - `.github/workflows/sync_docs_*.yml` - Auto-translation workflow triggers diff --git a/tools/translate-test-dify/.gitignore b/tools/translate-test-dify/.gitignore new file mode 100644 index 00000000..a50dbc61 --- /dev/null +++ b/tools/translate-test-dify/.gitignore @@ -0,0 +1,25 @@ +# Test results (generated per run) +results/ + +# Mock docs (copied from en/ for testing) +mock_docs/ + +# Environment files with API keys +.env +*.env + +# Python virtual environment +venv/ +.venv/ + +# Python cache +__pycache__/ +*.pyc +*.pyo + +# IDE +.vscode/ +.idea/ + +# OS files +.DS_Store diff --git a/tools/translate-test-dify/README.md b/tools/translate-test-dify/README.md new file mode 100644 index 00000000..4daa7c5f --- /dev/null +++ b/tools/translate-test-dify/README.md @@ -0,0 +1,92 @@ +# Translation Testing Framework + +A/B testing for Dify translation workflows. Primary user: **Claude Code**. 
+ +## Important + +- **DO NOT commit test results** - `results/` is gitignored +- **DO NOT commit real API keys** - always redact with `app-***` before committing +- **DO NOT commit mock_docs/** - temporary files copied for testing + +## Quick Start + +```bash +# Setup (first time) +./setup.sh +source venv/bin/activate + +# Run test +python run_test.py + +# Compare results +python compare.py results// +``` + +## Test Spec Format + +```markdown +# Test Title + +## keys +app-xxx +Description A + +app-yyy +Description B + +## test_content +(Inline content - Claude Code generates this for each test) + +# OR reference existing file: +## test_file +en/guides/workflow/some-doc.md +``` + +## Workflow + +1. User describes test scenario +2. Claude Code creates spec with `## test_content` tailored to the issue +3. Run: `source venv/bin/activate && python run_test.py spec.md` +4. Analyze: `python compare.py results//` +5. **Redact API keys** with `app-***` before committing + +## Example: Punctuation Test + +```markdown +# Punctuation Test + +## keys +app-*** +Sonnet + +app-*** +Opus + +## test_content +--- +title: Test Doc +--- + +# Test + +Sentence with commas, colons: semicolons; and more. + +- Item one, comma +- Item two; semicolon +``` + +See `example-model-comparison.md` for a complete example. + +## Files + +| File | Purpose | +|------|---------| +| run_test.py | Test runner | +| compare.py | Generate comparison reports | +| example-model-comparison.md | Example test spec | +| results/ | Output (gitignored) | +| mock_docs/ | Temp test files (gitignored) | + +## Language Policy + +All code and documentation in **English** (international project). 
diff --git a/tools/translate-test-dify/compare.py b/tools/translate-test-dify/compare.py new file mode 100644 index 00000000..1491a7ef --- /dev/null +++ b/tools/translate-test-dify/compare.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python3 +""" +Translation Comparison Utility +Usage: python compare.py +""" + +import sys +import json +import difflib +from pathlib import Path +from datetime import datetime + + +def load_file(path: Path) -> str: + return path.read_text(encoding="utf-8") if path.exists() else "" + + +def save_file(path: Path, content: str): + path.write_text(content, encoding="utf-8") + + +def get_metrics(content: str) -> dict: + lines = content.split("\n") + return {"chars": len(content), "lines": len(lines), "words": len(content.split())} + + +def similarity(t1: str, t2: str) -> float: + if not t1 or not t2: + return 0.0 + return difflib.SequenceMatcher(None, t1, t2).ratio() + + +def excerpt(content: str, max_lines: int = 20) -> str: + lines = content.split("\n") + result = "\n".join(lines[:max_lines]) + if len(lines) > max_lines: + result += f"\n... 
({len(lines) - max_lines} more lines)" + return result + + +def generate_md_report(config: dict, variants: dict, translations: dict) -> str: + lines = [ + f"# Comparison Report", + f"**Test:** {config.get('test_name', 'Unknown')}", + f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", + f"**Variants:** {', '.join(variants.keys())}", + "", + "## Variants", + "| Variant | Description |", + "|---------|-------------|" + ] + for name, cfg in variants.items(): + lines.append(f"| {name} | {cfg.get('description', '')} |") + lines.append("") + + for lang, var_contents in translations.items(): + lines.append(f"## {lang.upper()}") + lines.append("") + lines.append("| Variant | Chars | Lines |") + lines.append("|---------|-------|-------|") + for var, content in var_contents.items(): + if content: + m = get_metrics(content) + lines.append(f"| {var} | {m['chars']} | {m['lines']} |") + else: + lines.append(f"| {var} | - | - |") + lines.append("") + + vars_list = list(var_contents.keys()) + if len(vars_list) >= 2: + v1, v2 = vars_list[0], vars_list[1] + sim = similarity(var_contents.get(v1, ""), var_contents.get(v2, "")) + lines.append(f"Similarity ({v1} vs {v2}): **{sim:.1%}**") + lines.append("") + + for var, content in var_contents.items(): + if content: + lines.append(f"
<details><summary>{var}</summary>\n\n```\n{excerpt(content, 25)}\n```\n</details>
") + lines.append("") + + return "\n".join(lines) + + +def generate_txt_report(config: dict, variants: dict, translations: dict) -> str: + lines = [ + "=" * 50, + "COMPARISON SUMMARY", + "=" * 50, + f"Test: {config.get('test_name', 'Unknown')}", + f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", + "", + "Variants:" + ] + for name, cfg in variants.items(): + lines.append(f" {name}: {cfg.get('description', '')}") + lines.append("") + + for lang, var_contents in translations.items(): + lines.append(f"{lang.upper()}:") + for var, content in var_contents.items(): + if content: + m = get_metrics(content) + lines.append(f" {var}: {m['chars']} chars, {m['lines']} lines") + else: + lines.append(f" {var}: FAILED") + + vars_list = list(var_contents.keys()) + if len(vars_list) >= 2: + v1, v2 = vars_list[0], vars_list[1] + sim = similarity(var_contents.get(v1, ""), var_contents.get(v2, "")) + lines.append(f" Similarity: {sim:.1%}") + lines.append("") + + return "\n".join(lines) + + +def compare_results(result_dir: str): + result_path = Path(result_dir) + if not result_path.exists(): + print(f"Error: {result_dir} not found") + sys.exit(1) + + config_path = result_path / "config.json" + if not config_path.exists(): + print(f"Error: config.json not found") + sys.exit(1) + + with open(config_path, "r", encoding="utf-8") as f: + config = json.load(f) + + variants = config.get("variants", {}) + target_languages = config.get("target_languages", ["cn"]) + + print(f"Comparing: {result_path.name}") + print(f"Variants: {', '.join(variants.keys())}") + + # Find translation files in variant folders + translations = {} # {lang: {variant: content}} + for lang in target_languages: + translations[lang] = {} + for var in variants: + var_dir = result_path / f"variant_{var}" + if var_dir.exists(): + # Find any *_{lang}.md file + for f in var_dir.glob(f"*_{lang}.md"): + content = load_file(f) + translations[lang][var] = content + if content: + print(f" Loaded: 
{f.relative_to(result_path)}") + break + + md_report = generate_md_report(config, variants, translations) + txt_report = generate_txt_report(config, variants, translations) + + save_file(result_path / "comparison.md", md_report) + save_file(result_path / "comparison.txt", txt_report) + + print(f"\nGenerated: comparison.md, comparison.txt") + + +def main(): + if len(sys.argv) < 2: + print("Usage: python compare.py ") + sys.exit(1) + compare_results(sys.argv[1]) + + +if __name__ == "__main__": + main() diff --git a/tools/translate-test-dify/example-model-comparison.md b/tools/translate-test-dify/example-model-comparison.md new file mode 100644 index 00000000..e79656f9 --- /dev/null +++ b/tools/translate-test-dify/example-model-comparison.md @@ -0,0 +1,27 @@ +# Model Comparison Test + +## Background + +Compare translation quality between different models (e.g., Sonnet vs Opus) to evaluate improvements in accuracy, style, and punctuation handling. + +## keys + +app-*** +Model A (e.g., Sonnet original) + +app-*** +Model B (e.g., Opus upgraded) + +## test_file +en/self-host/quick-start/docker-compose.mdx + +## Conclusion + +(Record your findings here after testing, should be filled by AI Agent) + +| Variant | Config | Result | +|---------|--------|--------| +| A | Sonnet | | +| B | Opus | | + +**Recommendation**: (Your recommendation based on test results) diff --git a/tools/translate-test-dify/requirements.txt b/tools/translate-test-dify/requirements.txt new file mode 100644 index 00000000..ae3ec980 --- /dev/null +++ b/tools/translate-test-dify/requirements.txt @@ -0,0 +1,10 @@ +# Translation Testing Framework Dependencies + +# Async HTTP client for streaming API calls +httpx>=0.25.0 + +# Async file I/O +aiofiles>=23.0.0 + +# Optional: Load environment from .env file +python-dotenv>=1.0.0 diff --git a/tools/translate-test-dify/run_test.py b/tools/translate-test-dify/run_test.py new file mode 100644 index 00000000..62b93094 --- /dev/null +++ 
b/tools/translate-test-dify/run_test.py @@ -0,0 +1,284 @@ +#!/usr/bin/env python3 +""" +Translation A/B Testing Runner +Usage: python run_test.py [--dry-run] + +Test spec format: + ## keys + app-xxx + Description A + + ## test_content + (inline content to translate) + + OR + + ## test_file + path/to/file.md +""" + +import httpx +import os +import sys +import asyncio +import json +import re +import random +from pathlib import Path +from datetime import datetime + +SCRIPT_DIR = Path(__file__).resolve().parent +RESULTS_DIR = SCRIPT_DIR / "results" +TERMBASE_PATH = SCRIPT_DIR.parent / "translate" / "termbase_i18n.md" +DIFY_API_URL = "https://api.dify.ai/v1/workflows/run" +LANGUAGE_NAMES = {"cn": "Chinese", "jp": "Japanese", "en": "English"} + + +def load_env_file(): + env_path = SCRIPT_DIR / ".env" + if not env_path.exists(): + return + with open(env_path, "r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if line and not line.startswith("#") and "=" in line: + key, value = line.split("=", 1) + os.environ[key.strip()] = value.strip().strip("'\"") + + +def parse_markdown_spec(file_path: Path) -> dict: + """Parse markdown test spec""" + with open(file_path, "r", encoding="utf-8") as f: + content = f.read() + + config = { + "test_name": file_path.stem, + "target_languages": ["cn"], + "test_content": None, + "test_file": None, + "variants": {} + } + + # Title + title_match = re.search(r'^#\s+(.+)$', content, re.MULTILINE) + if title_match: + config["test_name"] = re.sub(r'[^a-zA-Z0-9]+', '_', title_match.group(1)).lower().strip('_') + + # Keys + keys_match = re.search(r'##\s*keys\s*\n(.*?)(?=\n##|\Z)', content, re.DOTALL | re.IGNORECASE) + if keys_match: + lines = [l.strip() for l in keys_match.group(1).strip().split('\n') if l.strip()] + variant_names = iter('ABCDEFGHIJ') + i = 0 + while i < len(lines): + if lines[i].startswith('app-') and not lines[i].startswith('app-***'): + api_key = lines[i] + description = lines[i + 1] if i + 1 < len(lines) and not 
lines[i + 1].startswith('app-') else "" + if description: + i += 1 + config["variants"][next(variant_names)] = {"api_key": api_key, "description": description} + i += 1 + + # Target languages + lang_match = re.search(r'##\s*target_languages\s*\n(.*?)(?=\n##|\Z)', content, re.DOTALL | re.IGNORECASE) + if lang_match: + langs = [l.strip() for l in lang_match.group(1).strip().split('\n') if l.strip() and not l.startswith('#')] + if langs: + config["target_languages"] = langs + + # Inline test content + content_match = re.search(r'##\s*test_content\s*\n(.*?)(?=\n##|\Z)', content, re.DOTALL | re.IGNORECASE) + if content_match: + config["test_content"] = content_match.group(1).strip() + + # Test file path + file_match = re.search(r'##\s*test_file\s*\n(.*?)(?=\n##|\Z)', content, re.DOTALL | re.IGNORECASE) + if file_match: + config["test_file"] = file_match.group(1).strip().split('\n')[0].strip() + + return config + + +def is_markdown_spec(file_path: Path) -> bool: + if file_path.suffix in ['.md', '.mdx']: + with open(file_path, "r", encoding="utf-8") as f: + return bool(re.search(r'##\s*keys', f.read(), re.IGNORECASE)) + return False + + +async def load_file(file_path: Path) -> str: + import aiofiles + async with aiofiles.open(file_path, "r", encoding="utf-8") as f: + return await f.read() + + +async def save_file(file_path: Path, content: str): + import aiofiles + file_path.parent.mkdir(parents=True, exist_ok=True) + async with aiofiles.open(file_path, "w", encoding="utf-8") as f: + await f.write(content) + + +async def translate_text(content: str, api_key: str, target_language: str, termbase: str, max_retries: int = 3) -> str: + payload = { + "response_mode": "streaming", + "user": "TranslationTest", + "inputs": { + "original_language": "English", + "output_language1": target_language, + "the_doc": content, + "termbase": termbase + } + } + headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"} + + for attempt in range(max_retries): + try: + 
if attempt > 0: + delay = min(30 * (2 ** (attempt - 1)), 300) * random.uniform(0.8, 1.2) + print(f" Retry {attempt + 1}/{max_retries}...") + await asyncio.sleep(delay) + + async with httpx.AsyncClient(timeout=600.0) as client: + async with client.stream("POST", DIFY_API_URL, json=payload, headers=headers) as response: + if response.status_code != 200: + print(f" HTTP {response.status_code}") + if response.status_code in [502, 503, 504] and attempt < max_retries - 1: + continue + return "" + + output1 = None + async for line in response.aiter_lines(): + if line.strip().startswith("data: "): + try: + data = json.loads(line[6:]) + if data.get("event") == "workflow_finished": + output1 = data.get("data", {}).get("outputs", {}).get("output1", "") + elif data.get("event") == "error": + print(f" Error: {data.get('message')}") + return "" + except json.JSONDecodeError: + continue + + if output1: + return output1 + if attempt < max_retries - 1: + continue + return "" + + except (httpx.ReadTimeout, httpx.ConnectTimeout, httpx.HTTPError) as e: + print(f" {type(e).__name__}") + if attempt >= max_retries - 1: + return "" + + return "" + + +async def run_test(config_path: str, dry_run: bool = False): + load_env_file() + + config_file = Path(config_path) + if not config_file.exists(): + print(f"Error: {config_path} not found") + sys.exit(1) + + if not is_markdown_spec(config_file): + print(f"Error: Not a valid test spec (needs ## keys section)") + sys.exit(1) + + print(f"Parsing: {config_path}") + config = parse_markdown_spec(config_file) + + test_name = config.get("test_name", "test") + target_languages = config.get("target_languages", ["cn"]) + variants = config.get("variants", {}) + test_content = config.get("test_content") + test_file = config.get("test_file") + + if not variants: + print("Error: No valid API keys found") + sys.exit(1) + + # Get test content + if test_content: + doc_content = test_content + doc_name = "inline" + elif test_file: + test_file_path = 
Path(test_file) + if not test_file_path.is_absolute(): + test_file_path = SCRIPT_DIR.parent.parent / test_file + if not test_file_path.exists(): + print(f"Error: Test file not found: {test_file}") + sys.exit(1) + doc_content = await load_file(test_file_path) + doc_name = test_file_path.stem + else: + print("Error: Need ## test_content or ## test_file section") + sys.exit(1) + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + result_dir = RESULTS_DIR / f"{timestamp}_{test_name}" + + if not dry_run: + result_dir.mkdir(parents=True, exist_ok=True) + with open(result_dir / "config.json", "w", encoding="utf-8") as f: + json.dump(config, f, indent=2, ensure_ascii=False) + # Save source content + with open(result_dir / f"source_{doc_name}.md", "w", encoding="utf-8") as f: + f.write(doc_content) + + termbase = await load_file(TERMBASE_PATH) if TERMBASE_PATH.exists() else "" + + print(f"\n{'='*50}") + print(f"Test: {test_name}") + print(f"Variants: {', '.join(variants.keys())}") + print(f"Languages: {', '.join(target_languages)}") + print(f"Content: {doc_name} ({len(doc_content)} chars)") + print(f"{'='*50}\n") + + all_results = {} + for var_name, var_config in variants.items(): + api_key = var_config.get("api_key") + desc = var_config.get("description", "") + print(f"{var_name}: {desc}") + + if dry_run: + print(" DRY RUN") + continue + + var_dir = result_dir / f"variant_{var_name}" + var_dir.mkdir(parents=True, exist_ok=True) + + results = {} + for lang in target_languages: + lang_name = LANGUAGE_NAMES.get(lang, lang) + print(f" → {lang_name}...", end=" ", flush=True) + + translated = await translate_text(doc_content, api_key, lang_name, termbase) + if translated: + out_file = var_dir / f"{doc_name}_{lang}.md" + await save_file(out_file, translated) + print(f"OK ({len(translated)} chars)") + results[lang] = {"status": "ok", "chars": len(translated)} + else: + print("FAIL") + results[lang] = {"status": "fail"} + + all_results[var_name] = results + + if not dry_run: + 
with open(result_dir / "results.json", "w", encoding="utf-8") as f: + json.dump(all_results, f, indent=2, ensure_ascii=False) + print(f"\nResults: {result_dir}") + print(f"Run: python compare.py {result_dir}") + + +def main(): + if len(sys.argv) < 2: + print("Usage: python run_test.py [--dry-run]") + sys.exit(1) + asyncio.run(run_test(sys.argv[1], "--dry-run" in sys.argv)) + + +if __name__ == "__main__": + main() diff --git a/tools/translate-test-dify/setup.sh b/tools/translate-test-dify/setup.sh new file mode 100755 index 00000000..5f924a79 --- /dev/null +++ b/tools/translate-test-dify/setup.sh @@ -0,0 +1,52 @@ +#!/bin/bash +# Translation Test Framework Setup Script +# Creates Python virtual environment and installs dependencies + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +echo "===================================" +echo "Translation Test Framework Setup" +echo "===================================" +echo "" + +# Check Python version +if ! command -v python3 &> /dev/null; then + echo "Error: Python 3 is required but not found" + exit 1 +fi + +PYTHON_VERSION=$(python3 -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")') +echo "Python version: $PYTHON_VERSION" + +# Create virtual environment +if [ -d "venv" ]; then + echo "Virtual environment already exists" +else + echo "Creating virtual environment..." + python3 -m venv venv + echo "Virtual environment created" +fi + +# Activate and install dependencies +echo "Installing dependencies..." +source venv/bin/activate +pip install --upgrade pip -q +pip install -r requirements.txt -q + +echo "" +echo "===================================" +echo "Setup Complete!" +echo "===================================" +echo "" +echo "To activate the environment:" +echo " source venv/bin/activate" +echo "" +echo "To run a test:" +echo " python run_test.py " +echo "" +echo "Example:" +echo " python run_test.py base-20251127.md" +echo ""