feat: add translation A/B testing framework (#564)

Add tools for comparing translation quality between different models
(e.g., Sonnet vs Opus) or prompt variations. Useful for evaluating
translation improvements before deploying changes.

- run_test.py: Test runner with Dify API streaming
- compare.py: Generate similarity reports between variants
- Example spec and documentation included

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
Alter-xyz
2025-11-28 00:02:13 -08:00
committed by GitHub
parent 1b2f5edc6a
commit 4339f79a55
8 changed files with 677 additions and 0 deletions

25
tools/translate-test-dify/.gitignore vendored Normal file
View File

@@ -0,0 +1,25 @@
# Test results (generated per run)
results/
# Mock docs (copied from en/ for testing)
mock_docs/
# Environment files with API keys
.env
*.env
# Python virtual environment
venv/
.venv/
# Python cache
__pycache__/
*.pyc
*.pyo
# IDE
.vscode/
.idea/
# OS files
.DS_Store

View File

@@ -0,0 +1,92 @@
# Translation Testing Framework
A/B testing for Dify translation workflows. Primary user: **Claude Code**.
## Important
- **DO NOT commit test results** - `results/` is gitignored
- **DO NOT commit real API keys** - always redact with `app-***` before committing
- **DO NOT commit mock_docs/** - temporary files copied for testing
## Quick Start
```bash
# Setup (first time)
./setup.sh
source venv/bin/activate
# Run test
python run_test.py <spec.md>
# Compare results
python compare.py results/<folder>/
```
## Test Spec Format
```markdown
# Test Title
## keys
app-xxx
Description A
app-yyy
Description B
## test_content
(Inline content - Claude Code generates this for each test)
# OR reference existing file:
## test_file
en/guides/workflow/some-doc.md
```
## Workflow
1. User describes test scenario
2. Claude Code creates spec with `## test_content` tailored to the issue
3. Run: `source venv/bin/activate && python run_test.py spec.md`
4. Analyze: `python compare.py results/<folder>/`
5. **Redact API keys** with `app-***` before committing
## Example: Punctuation Test
```markdown
# Punctuation Test
## keys
app-***
Sonnet
app-***
Opus
## test_content
---
title: Test Doc
---
# Test
Sentence with commas, colons: semicolons; and more.
- Item one, comma
- Item two; semicolon
```
See `example-model-comparison.md` for a complete example.
## Files
| File | Purpose |
|------|---------|
| run_test.py | Test runner |
| compare.py | Generate comparison reports |
| example-model-comparison.md | Example test spec |
| results/ | Output (gitignored) |
| mock_docs/ | Temp test files (gitignored) |
## Language Policy
All code and documentation in **English** (international project).

View File

@@ -0,0 +1,169 @@
#!/usr/bin/env python3
"""
Translation Comparison Utility
Usage: python compare.py <results_folder>
"""
import sys
import json
import difflib
from pathlib import Path
from datetime import datetime
def load_file(path: Path) -> str:
    """Read *path* as UTF-8 text; return an empty string if the file is missing."""
    if not path.exists():
        return ""
    return path.read_text(encoding="utf-8")
def save_file(path: Path, content: str):
    """Write *content* to *path* as UTF-8 text, replacing any existing file."""
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(content)
def get_metrics(content: str) -> dict:
    """Return simple size metrics for *content*: character, line, and word counts."""
    return {
        "chars": len(content),
        "lines": content.count("\n") + 1,
        "words": len(content.split()),
    }
def similarity(t1: str, t2: str) -> float:
    """Return a 0.0-1.0 similarity ratio between two texts (0.0 if either is empty)."""
    return difflib.SequenceMatcher(None, t1, t2).ratio() if t1 and t2 else 0.0
def excerpt(content: str, max_lines: int = 20) -> str:
    """Return the first *max_lines* lines of *content*, noting how many were omitted."""
    all_lines = content.split("\n")
    shown = "\n".join(all_lines[:max_lines])
    omitted = len(all_lines) - max_lines
    if omitted > 0:
        shown += f"\n... ({omitted} more lines)"
    return shown
def generate_md_report(config: dict, variants: dict, translations: dict) -> str:
    """Render a Markdown report comparing each variant's translations.

    Includes a variant overview table, per-language size metrics, a pairwise
    similarity score for the first two variants, and collapsible excerpts of
    each non-empty translation.
    """
    report = [
        "# Comparison Report",
        f"**Test:** {config.get('test_name', 'Unknown')}",
        f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        f"**Variants:** {', '.join(variants.keys())}",
        "",
        "## Variants",
        "| Variant | Description |",
        "|---------|-------------|",
    ]
    report.extend(f"| {name} | {cfg.get('description', '')} |" for name, cfg in variants.items())
    report.append("")
    for lang, by_variant in translations.items():
        report.extend([f"## {lang.upper()}", "", "| Variant | Chars | Lines |", "|---------|-------|-------|"])
        for variant, text in by_variant.items():
            if text:
                metrics = get_metrics(text)
                report.append(f"| {variant} | {metrics['chars']} | {metrics['lines']} |")
            else:
                # Failed / missing translation: show placeholder cells.
                report.append(f"| {variant} | - | - |")
        report.append("")
        names = list(by_variant)
        if len(names) >= 2:
            first, second = names[0], names[1]
            score = similarity(by_variant.get(first, ""), by_variant.get(second, ""))
            report.append(f"Similarity ({first} vs {second}): **{score:.1%}**")
            report.append("")
        for variant, text in by_variant.items():
            if text:
                report.append(f"<details><summary>{variant}</summary>\n\n```\n{excerpt(text, 25)}\n```\n</details>")
                report.append("")
    return "\n".join(report)
def generate_txt_report(config: dict, variants: dict, translations: dict) -> str:
    """Render a plain-text (console-friendly) summary of the comparison."""
    banner = "=" * 50
    report = [
        banner,
        "COMPARISON SUMMARY",
        banner,
        f"Test: {config.get('test_name', 'Unknown')}",
        f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "",
        "Variants:",
    ]
    report.extend(f" {name}: {cfg.get('description', '')}" for name, cfg in variants.items())
    report.append("")
    for lang, by_variant in translations.items():
        report.append(f"{lang.upper()}:")
        for variant, text in by_variant.items():
            if text:
                metrics = get_metrics(text)
                report.append(f" {variant}: {metrics['chars']} chars, {metrics['lines']} lines")
            else:
                report.append(f" {variant}: FAILED")
        names = list(by_variant)
        if len(names) >= 2:
            score = similarity(by_variant.get(names[0], ""), by_variant.get(names[1], ""))
            report.append(f" Similarity: {score:.1%}")
        report.append("")
    return "\n".join(report)
def compare_results(result_dir: str):
    """Load translations from *result_dir* and write comparison reports.

    Expects the layout produced by run_test.py: a config.json plus one
    variant_<X>/ folder per variant containing *_<lang>.md translation files.
    Writes comparison.md and comparison.txt back into *result_dir*.
    Exits the process with status 1 if the folder or its config is missing.
    """
    result_path = Path(result_dir)
    if not result_path.exists():
        print(f"Error: {result_dir} not found")
        sys.exit(1)
    config_path = result_path / "config.json"
    if not config_path.exists():
        print(f"Error: config.json not found")
        sys.exit(1)
    with open(config_path, "r", encoding="utf-8") as f:
        config = json.load(f)
    variants = config.get("variants", {})
    target_languages = config.get("target_languages", ["cn"])
    print(f"Comparing: {result_path.name}")
    print(f"Variants: {', '.join(variants.keys())}")
    # Find translation files in variant folders
    translations = {}  # {lang: {variant: content}}
    for lang in target_languages:
        translations[lang] = {}
        for var in variants:
            var_dir = result_path / f"variant_{var}"
            if var_dir.exists():
                # Find any *_{lang}.md file
                for f in var_dir.glob(f"*_{lang}.md"):
                    content = load_file(f)
                    translations[lang][var] = content
                    if content:
                        print(f" Loaded: {f.relative_to(result_path)}")
                        # NOTE(review): stops at the first non-empty match;
                        # empty files keep the search going — confirm intended.
                        break
    md_report = generate_md_report(config, variants, translations)
    txt_report = generate_txt_report(config, variants, translations)
    save_file(result_path / "comparison.md", md_report)
    save_file(result_path / "comparison.txt", txt_report)
    print(f"\nGenerated: comparison.md, comparison.txt")
def main():
    """CLI entry point: validate arguments and run the comparison."""
    args = sys.argv
    if len(args) < 2:
        print("Usage: python compare.py <results_folder>")
        sys.exit(1)
    compare_results(args[1])


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,27 @@
# Model Comparison Test
## Background
Compare translation quality between different models (e.g., Sonnet vs Opus) to evaluate improvements in accuracy, style, and punctuation handling.
## keys
app-***
Model A (e.g., Sonnet original)
app-***
Model B (e.g., Opus upgraded)
## test_file
en/self-host/quick-start/docker-compose.mdx
## Conclusion
(Record your findings here after testing; this section is to be filled in by the AI agent.)
| Variant | Config | Result |
|---------|--------|--------|
| A | Sonnet | |
| B | Opus | |
**Recommendation**: (Your recommendation based on test results)

View File

@@ -0,0 +1,10 @@
# Translation Testing Framework Dependencies
# Async HTTP client for streaming API calls
httpx>=0.25.0
# Async file I/O
aiofiles>=23.0.0
# Optional: Load environment from .env file
python-dotenv>=1.0.0

View File

@@ -0,0 +1,284 @@
#!/usr/bin/env python3
"""
Translation A/B Testing Runner
Usage: python run_test.py <spec.md> [--dry-run]
Test spec format:
## keys
app-xxx
Description A
## test_content
(inline content to translate)
OR
## test_file
path/to/file.md
"""
import httpx
import os
import sys
import asyncio
import json
import re
import random
from pathlib import Path
from datetime import datetime
# Directory containing this script; all relative paths below are anchored here.
SCRIPT_DIR = Path(__file__).resolve().parent
# Per-run test output is written under results/ (gitignored).
RESULTS_DIR = SCRIPT_DIR / "results"
# Shared terminology base passed to the translation workflow, if present.
TERMBASE_PATH = SCRIPT_DIR.parent / "translate" / "termbase_i18n.md"
# Dify workflow-run endpoint used for every translation request.
DIFY_API_URL = "https://api.dify.ai/v1/workflows/run"
# Maps short language codes used in specs to the names the workflow expects.
LANGUAGE_NAMES = {"cn": "Chinese", "jp": "Japanese", "en": "English"}
def load_env_file():
    """Populate os.environ from KEY=VALUE lines in the local .env file, if any."""
    env_file = SCRIPT_DIR / ".env"
    if not env_file.exists():
        return
    with open(env_file, "r", encoding="utf-8") as fh:
        for raw in fh:
            entry = raw.strip()
            # Skip blanks, comments, and lines without an assignment.
            if not entry or entry.startswith("#") or "=" not in entry:
                continue
            key, _, value = entry.partition("=")
            os.environ[key.strip()] = value.strip().strip("'\"")
def parse_markdown_spec(file_path: Path) -> dict:
    """Parse a markdown test spec into a config dict.

    Recognized parts: the first '# ' heading (sanitized into test_name),
    '## keys' (API-key lines, each optionally followed by a description line),
    '## target_languages', '## test_content' (inline source text) and
    '## test_file' (path to an existing document).
    """
    text = file_path.read_text(encoding="utf-8")
    config = {
        "test_name": file_path.stem,
        "target_languages": ["cn"],
        "test_content": None,
        "test_file": None,
        "variants": {},
    }

    def section(name):
        # Grab the body of a '## <name>' section, up to the next '##' or EOF.
        m = re.search(rf'##\s*{name}\s*\n(.*?)(?=\n##|\Z)', text, re.DOTALL | re.IGNORECASE)
        return m.group(1) if m else None

    # Title -> sanitized, lowercase test name.
    title = re.search(r'^#\s+(.+)$', text, re.MULTILINE)
    if title:
        config["test_name"] = re.sub(r'[^a-zA-Z0-9]+', '_', title.group(1)).lower().strip('_')

    # Variant keys: real 'app-...' lines only (redacted 'app-***' are skipped).
    keys_block = section('keys')
    if keys_block is not None:
        entries = [ln.strip() for ln in keys_block.strip().split('\n') if ln.strip()]
        names = iter('ABCDEFGHIJ')
        idx = 0
        while idx < len(entries):
            line = entries[idx]
            if line.startswith('app-') and not line.startswith('app-***'):
                desc = ""
                nxt = entries[idx + 1] if idx + 1 < len(entries) else ""
                if nxt and not nxt.startswith('app-'):
                    desc = nxt
                    idx += 1  # description line consumed
                config["variants"][next(names)] = {"api_key": line, "description": desc}
            idx += 1

    # Optional list of target language codes (one per line).
    langs_block = section('target_languages')
    if langs_block is not None:
        langs = [ln.strip() for ln in langs_block.strip().split('\n')
                 if ln.strip() and not ln.startswith('#')]
        if langs:
            config["target_languages"] = langs

    # Inline content takes the whole section body verbatim (stripped).
    inline = section('test_content')
    if inline is not None:
        config["test_content"] = inline.strip()

    # File reference: only the first line of the section is the path.
    file_ref = section('test_file')
    if file_ref is not None:
        config["test_file"] = file_ref.strip().split('\n')[0].strip()

    return config
def is_markdown_spec(file_path: Path) -> bool:
    """Return True if *file_path* is a markdown file containing a '## keys' section."""
    if file_path.suffix not in ('.md', '.mdx'):
        return False
    text = file_path.read_text(encoding="utf-8")
    return re.search(r'##\s*keys', text, re.IGNORECASE) is not None
async def load_file(file_path: Path) -> str:
    """Asynchronously read *file_path* as UTF-8 text and return its contents."""
    import aiofiles  # imported lazily so --dry-run paths needn't require it
    async with aiofiles.open(file_path, "r", encoding="utf-8") as handle:
        return await handle.read()
async def save_file(file_path: Path, content: str):
    """Asynchronously write *content* to *file_path*, creating parent dirs as needed."""
    import aiofiles  # imported lazily so --dry-run paths needn't require it
    file_path.parent.mkdir(parents=True, exist_ok=True)
    async with aiofiles.open(file_path, "w", encoding="utf-8") as handle:
        await handle.write(content)
async def translate_text(content: str, api_key: str, target_language: str, termbase: str, max_retries: int = 3) -> str:
    """Translate *content* into *target_language* via the Dify workflow API.

    Streams the server-sent-event response and returns the 'output1' field of
    the finished workflow, or an empty string on failure. Transient failures
    (502/503/504, timeouts, empty output) are retried up to *max_retries*
    times with jittered exponential backoff.
    """
    payload = {
        "response_mode": "streaming",
        "user": "TranslationTest",
        "inputs": {
            "original_language": "English",
            "output_language1": target_language,
            "the_doc": content,
            "termbase": termbase
        }
    }
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    for attempt in range(max_retries):
        try:
            if attempt > 0:
                # Exponential backoff capped at 300s, with +/-20% jitter.
                delay = min(30 * (2 ** (attempt - 1)), 300) * random.uniform(0.8, 1.2)
                print(f" Retry {attempt + 1}/{max_retries}...")
                await asyncio.sleep(delay)
            async with httpx.AsyncClient(timeout=600.0) as client:
                async with client.stream("POST", DIFY_API_URL, json=payload, headers=headers) as response:
                    if response.status_code != 200:
                        print(f" HTTP {response.status_code}")
                        # Gateway errors are transient; anything else is fatal.
                        if response.status_code in [502, 503, 504] and attempt < max_retries - 1:
                            continue
                        return ""
                    output1 = None
                    # SSE stream: each 'data: ' line carries one JSON event.
                    async for line in response.aiter_lines():
                        if line.strip().startswith("data: "):
                            try:
                                data = json.loads(line[6:])
                                if data.get("event") == "workflow_finished":
                                    output1 = data.get("data", {}).get("outputs", {}).get("output1", "")
                                elif data.get("event") == "error":
                                    # Workflow-level error: do not retry.
                                    print(f" Error: {data.get('message')}")
                                    return ""
                            except json.JSONDecodeError:
                                # Ignore malformed or partial SSE lines.
                                continue
                    if output1:
                        return output1
                    # Stream finished without usable output: retry if possible.
                    if attempt < max_retries - 1:
                        continue
                    return ""
        except (httpx.ReadTimeout, httpx.ConnectTimeout, httpx.HTTPError) as e:
            # Network-level failure: fall through to the next retry attempt.
            print(f" {type(e).__name__}")
            if attempt >= max_retries - 1:
                return ""
    return ""
async def run_test(config_path: str, dry_run: bool = False):
    """Run the A/B translation test described by the spec at *config_path*.

    Parses the spec, resolves the source document (inline content or a file
    reference), then translates it once per (variant, target-language) pair
    via the Dify API. Outputs land in results/<timestamp>_<test_name>/ along
    with config.json, the source document and a results.json summary.
    With *dry_run* set, only parsing and the plan printout happen.
    Exits the process with status 1 on any spec/setup problem.
    """
    load_env_file()
    config_file = Path(config_path)
    if not config_file.exists():
        print(f"Error: {config_path} not found")
        sys.exit(1)
    if not is_markdown_spec(config_file):
        print(f"Error: Not a valid test spec (needs ## keys section)")
        sys.exit(1)
    print(f"Parsing: {config_path}")
    config = parse_markdown_spec(config_file)
    test_name = config.get("test_name", "test")
    target_languages = config.get("target_languages", ["cn"])
    variants = config.get("variants", {})
    test_content = config.get("test_content")
    test_file = config.get("test_file")
    if not variants:
        print("Error: No valid API keys found")
        sys.exit(1)
    # Get test content: inline content takes precedence over a file reference.
    if test_content:
        doc_content = test_content
        doc_name = "inline"
    elif test_file:
        test_file_path = Path(test_file)
        if not test_file_path.is_absolute():
            # Relative spec paths are resolved against the repository root.
            test_file_path = SCRIPT_DIR.parent.parent / test_file
        if not test_file_path.exists():
            print(f"Error: Test file not found: {test_file}")
            sys.exit(1)
        doc_content = await load_file(test_file_path)
        doc_name = test_file_path.stem
    else:
        print("Error: Need ## test_content or ## test_file section")
        sys.exit(1)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    result_dir = RESULTS_DIR / f"{timestamp}_{test_name}"
    if not dry_run:
        result_dir.mkdir(parents=True, exist_ok=True)
        # Persist the parsed config so compare.py can reconstruct the run.
        with open(result_dir / "config.json", "w", encoding="utf-8") as f:
            json.dump(config, f, indent=2, ensure_ascii=False)
        # Save source content
        with open(result_dir / f"source_{doc_name}.md", "w", encoding="utf-8") as f:
            f.write(doc_content)
    termbase = await load_file(TERMBASE_PATH) if TERMBASE_PATH.exists() else ""
    print(f"\n{'='*50}")
    print(f"Test: {test_name}")
    print(f"Variants: {', '.join(variants.keys())}")
    print(f"Languages: {', '.join(target_languages)}")
    print(f"Content: {doc_name} ({len(doc_content)} chars)")
    print(f"{'='*50}\n")
    all_results = {}
    for var_name, var_config in variants.items():
        api_key = var_config.get("api_key")
        desc = var_config.get("description", "")
        print(f"{var_name}: {desc}")
        if dry_run:
            print(" DRY RUN")
            continue
        var_dir = result_dir / f"variant_{var_name}"
        var_dir.mkdir(parents=True, exist_ok=True)
        results = {}
        for lang in target_languages:
            lang_name = LANGUAGE_NAMES.get(lang, lang)
            print(f"{lang_name}...", end=" ", flush=True)
            translated = await translate_text(doc_content, api_key, lang_name, termbase)
            if translated:
                out_file = var_dir / f"{doc_name}_{lang}.md"
                await save_file(out_file, translated)
                print(f"OK ({len(translated)} chars)")
                results[lang] = {"status": "ok", "chars": len(translated)}
            else:
                # translate_text already logged the failure reason.
                print("FAIL")
                results[lang] = {"status": "fail"}
        all_results[var_name] = results
    if not dry_run:
        with open(result_dir / "results.json", "w", encoding="utf-8") as f:
            json.dump(all_results, f, indent=2, ensure_ascii=False)
        print(f"\nResults: {result_dir}")
        print(f"Run: python compare.py {result_dir}")
def main():
    """CLI entry point: parse argv and launch the async test runner."""
    argv = sys.argv
    if len(argv) < 2:
        print("Usage: python run_test.py <spec.md> [--dry-run]")
        sys.exit(1)
    asyncio.run(run_test(argv[1], "--dry-run" in argv))


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,52 @@
#!/bin/bash
# Translation Test Framework Setup Script
# Creates Python virtual environment and installs dependencies
set -e

# Always operate from the directory that contains this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

echo "==================================="
echo "Translation Test Framework Setup"
echo "==================================="
echo ""

# Check Python version
if ! command -v python3 &> /dev/null; then
    echo "Error: Python 3 is required but not found"
    exit 1
fi
PYTHON_VERSION=$(python3 -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')
echo "Python version: $PYTHON_VERSION"

# Create virtual environment (reuse it if one already exists)
if [ -d "venv" ]; then
    echo "Virtual environment already exists"
else
    echo "Creating virtual environment..."
    python3 -m venv venv
    echo "Virtual environment created"
fi

# Activate and install dependencies inside the venv
echo "Installing dependencies..."
source venv/bin/activate
pip install --upgrade pip -q
pip install -r requirements.txt -q

echo ""
echo "==================================="
echo "Setup Complete!"
echo "==================================="
echo ""
echo "To activate the environment:"
echo " source venv/bin/activate"
echo ""
echo "To run a test:"
echo " python run_test.py <test-spec.md>"
echo ""
echo "Example:"
echo " python run_test.py base-20251127.md"
echo ""