feat(eval): add external scoring mode (#12729)

* wip: add llm relevant & BrowseComp

* wip: add widesearch desc

* wip: dsqa, hle, widesearch

* wip: add dsqa

* wip: add awaiting eval status for runs

* wip: add awaiting status for run

* wip: adjust hle-verified

* 🐛 fix: browsecomp topics

* 📝 docs: add annotations

* wip: add awaiting status for pass@k

* wip: add complete status

* wip: update thread dots

* wip: update run status page

* wip: remove useless impl

* wip: update prompt

* feat: add external eval routes

* wip: add eval cli

* 🐛 fix: support authorize in no-browser environment

* wip: pass tests

* ♻️ refactor: remove tests

* ♻️ refactor: to camel case
This commit is contained in:
Rylan Cai
2026-03-10 09:53:26 +08:00
committed by GitHub
parent 255a1c21a8
commit ea329113be
34 changed files with 1655 additions and 40 deletions

View File

@@ -157,13 +157,15 @@
"difficulty.easy": "Easy",
"difficulty.hard": "Hard",
"difficulty.medium": "Medium",
"evalMode.answer-relevance": "LLM Relevance",
"evalMode.answer-relevance.desc": "Use LLM to evaluate answer relevance (yes or no)",
"evalMode.contains": "Contains Match",
"evalMode.contains.desc": "Output must contain the expected text",
"evalMode.equals": "Exact Match",
"evalMode.equals.desc": "Output must be exactly the same as expected",
"evalMode.label": "Eval Mode",
"evalMode.llm-rubric": "LLM Judge",
"evalMode.llm-rubric.desc": "Use LLM to evaluate output quality",
"evalMode.llm-rubric.desc": "Use LLM to evaluate output quality based on custom criteria (0.0 to 1.0)",
"evalMode.placeholder": "Select eval mode",
"evalMode.prompt.label": "Judge Prompt",
"evalMode.prompt.placeholder": "Enter the evaluation criteria or prompt for LLM judge",
@@ -256,12 +258,16 @@
"run.running.hint": "Evaluation is running, results will appear shortly...",
"run.status.aborted": "Aborted",
"run.status.completed": "Completed",
"run.status.completed.tooltip": "This evaluation has completed running all test cases and scoring.",
"run.status.error": "Run Error",
"run.status.external": "External",
"run.status.external.tooltip": "This evaluation is waiting for external scoring. Results will be updated when scoring is complete.",
"run.status.failed": "Failed",
"run.status.idle": "Idle",
"run.status.pending": "Pending",
"run.status.running": "Running",
"run.status.timeout": "Timeout",
"sidebar": "Evaluation",
"sidebar.benchmarks": "Benchmarks",
"sidebar.dashboard": "Dashboard",
"sidebar.datasets": "Datasets",

View File

@@ -161,6 +161,8 @@
"evalMode.contains.desc": "输出中必须包含期望的文本",
"evalMode.equals": "精确匹配",
"evalMode.equals.desc": "输出必须与期望内容完全一致",
"evalMode.external": "外部评估",
"evalMode.external.desc": "智能体完成运行后,由外部系统提交评估结果",
"evalMode.label": "评估模式",
"evalMode.llm-rubric": "LLM 评判",
"evalMode.llm-rubric.desc": "使用 LLM 评估输出质量",
@@ -256,7 +258,10 @@
"run.running.hint": "评测进行中,结果即将呈现...",
"run.status.aborted": "已终止",
"run.status.completed": "已完成",
"run.status.completed.tooltip": "评测已完成运行,所有结果已评估。",
"run.status.error": "运行出错",
"run.status.external": "待外部评测",
"run.status.external.tooltip": "智能体已完成运行,等待外部系统提交评估结果。",
"run.status.failed": "失败",
"run.status.idle": "待开始",
"run.status.pending": "等待中",