feat(eval): add external scoring mode (#12729)

* wip: add llm relevant & BrowseComp

* wip: add widesearch desc

* wip: dsqa, hle, widesearch

* wip: add dsqa

* wip: add awaiting eval status for runs

* wip: add awaiting status for run

* wip: adjust hle-verified

* 🐛 fix: browsecomp topics

* 📝 docs: add annotations

* wip: add awaiting status for pass@k

* wip: add complete status

* wip: update thread dots

* wip: update run status page

* wip: remove useless impl

* wip: update prompt

* feat: add external eval routes

* wip: add eval cli

* 🐛 fix: support authorize in no-browser environment

* wip: pass tests

* ♻️ refactor: remove tests

* ♻️ refactor: to camel case
This commit is contained in:
Rylan Cai
2026-03-10 09:53:26 +08:00
committed by GitHub
parent 255a1c21a8
commit ea329113be
34 changed files with 1655 additions and 40 deletions

View File

@@ -157,13 +157,15 @@
"difficulty.easy": "Easy",
"difficulty.hard": "Hard",
"difficulty.medium": "Medium",
"evalMode.answer-relevance": "LLM Relevance",
"evalMode.answer-relevance.desc": "Use LLM to evaluate answer relevance (yes or no)",
"evalMode.contains": "Contains Match",
"evalMode.contains.desc": "Output must contain the expected text",
"evalMode.equals": "Exact Match",
"evalMode.equals.desc": "Output must be exactly the same as expected",
"evalMode.label": "Eval Mode",
"evalMode.llm-rubric": "LLM Judge",
"evalMode.llm-rubric.desc": "Use LLM to evaluate output quality",
"evalMode.llm-rubric.desc": "Use LLM to evaluate output quality based on custom criteria (0.0 to 1.0)",
"evalMode.placeholder": "Select eval mode",
"evalMode.prompt.label": "Judge Prompt",
"evalMode.prompt.placeholder": "Enter the evaluation criteria or prompt for LLM judge",
@@ -256,12 +258,16 @@
"run.running.hint": "Evaluation is running, results will appear shortly...",
"run.status.aborted": "Aborted",
"run.status.completed": "Completed",
"run.status.completed.tooltip": "This evaluation has completed running all test cases and scoring.",
"run.status.error": "Run Error",
"run.status.external": "External",
"run.status.external.tooltip": "This evaluation is waiting for external scoring. Results will be updated when scoring is complete.",
"run.status.failed": "Failed",
"run.status.idle": "Idle",
"run.status.pending": "Pending",
"run.status.running": "Running",
"run.status.timeout": "Timeout",
"sidebar": "Evaluation",
"sidebar.benchmarks": "Benchmarks",
"sidebar.dashboard": "Dashboard",
"sidebar.datasets": "Datasets",

View File

@@ -161,6 +161,8 @@
"evalMode.contains.desc": "输出中必须包含期望的文本",
"evalMode.equals": "精确匹配",
"evalMode.equals.desc": "输出必须与期望内容完全一致",
"evalMode.external": "外部评估",
"evalMode.external.desc": "智能体完成运行后,由外部系统提交评估结果",
"evalMode.label": "评估模式",
"evalMode.llm-rubric": "LLM 评判",
"evalMode.llm-rubric.desc": "使用 LLM 评估输出质量",
@@ -256,7 +258,10 @@
"run.running.hint": "评测进行中,结果即将呈现...",
"run.status.aborted": "已终止",
"run.status.completed": "已完成",
"run.status.completed.tooltip": "评测已完成运行,所有结果已评估。",
"run.status.error": "运行出错",
"run.status.external": "待外部评测",
"run.status.external.tooltip": "智能体已完成运行,等待外部系统提交评估结果。",
"run.status.failed": "失败",
"run.status.idle": "待开始",
"run.status.pending": "等待中",