mirror of
https://github.com/lobehub/lobehub.git
synced 2026-03-26 13:19:34 +07:00
✨ feat(eval): add external scoring mode (#12729)
* wip: add llm relevant & BrowseComp * wip: add widesearch desc * wip: dsqa, hle, widesearch * wip: add dsqa * wip: add awaiting eval status for runs * wip: add awaiting status for run * wip: adjust hle-verified * 🐛 fix: browsecomp topics * 📝 docs: add annotations * wip: add awaiting status for pass@k * wip: add complete status * wip: update thread dots * wip: update run status page * wip: remove useless impl * wip: update prompt * ✨ feat: add external eval routes * wip: add eval cli * 🐛 fix: support authorization in non-browser environments * wip: pass tests * ♻️ refactor: remove tests * ♻️ refactor: mo camel case
This commit is contained in:
@@ -157,13 +157,15 @@
|
||||
"difficulty.easy": "Easy",
|
||||
"difficulty.hard": "Hard",
|
||||
"difficulty.medium": "Medium",
|
||||
"evalMode.answer-relevance": "LLM Relevance",
|
||||
"evalMode.answer-relevance.desc": "Use LLM to evaluate answer relevance (yes or no)",
|
||||
"evalMode.contains": "Contains Match",
|
||||
"evalMode.contains.desc": "Output must contain the expected text",
|
||||
"evalMode.equals": "Exact Match",
|
||||
"evalMode.equals.desc": "Output must be exactly the same as expected",
|
||||
"evalMode.label": "Eval Mode",
|
||||
"evalMode.llm-rubric": "LLM Judge",
|
||||
"evalMode.llm-rubric.desc": "Use LLM to evaluate output quality",
|
||||
"evalMode.llm-rubric.desc": "Use LLM to evaluate output quality based on custom criteria (0.0 to 1.0)",
|
||||
"evalMode.placeholder": "Select eval mode",
|
||||
"evalMode.prompt.label": "Judge Prompt",
|
||||
"evalMode.prompt.placeholder": "Enter the evaluation criteria or prompt for LLM judge",
|
||||
@@ -256,12 +258,16 @@
|
||||
"run.running.hint": "Evaluation is running, results will appear shortly...",
|
||||
"run.status.aborted": "Aborted",
|
||||
"run.status.completed": "Completed",
|
||||
"run.status.completed.tooltip": "This evaluation has completed running all test cases and scoring.",
|
||||
"run.status.error": "Run Error",
|
||||
"run.status.external": "External",
|
||||
"run.status.external.tooltip": "This evaluation is waiting for external scoring. Results will be updated when scoring is complete.",
|
||||
"run.status.failed": "Failed",
|
||||
"run.status.idle": "Idle",
|
||||
"run.status.pending": "Pending",
|
||||
"run.status.running": "Running",
|
||||
"run.status.timeout": "Timeout",
|
||||
"sidebar": "Evaluation",
|
||||
"sidebar.benchmarks": "Benchmarks",
|
||||
"sidebar.dashboard": "Dashboard",
|
||||
"sidebar.datasets": "Datasets",
|
||||
|
||||
@@ -161,6 +161,8 @@
|
||||
"evalMode.contains.desc": "输出中必须包含期望的文本",
|
||||
"evalMode.equals": "精确匹配",
|
||||
"evalMode.equals.desc": "输出必须与期望内容完全一致",
|
||||
"evalMode.external": "外部评估",
|
||||
"evalMode.external.desc": "智能体完成运行后,由外部系统提交评估结果",
|
||||
"evalMode.label": "评估模式",
|
||||
"evalMode.llm-rubric": "LLM 评判",
|
||||
"evalMode.llm-rubric.desc": "使用 LLM 评估输出质量",
|
||||
@@ -256,7 +258,10 @@
|
||||
"run.running.hint": "评测进行中,结果即将呈现...",
|
||||
"run.status.aborted": "已终止",
|
||||
"run.status.completed": "已完成",
|
||||
"run.status.completed.tooltip": "评测已完成运行,所有结果已评估。",
|
||||
"run.status.error": "运行出错",
|
||||
"run.status.external": "待外部评测",
|
||||
"run.status.external.tooltip": "智能体已完成运行,等待外部系统提交评估结果。",
|
||||
"run.status.failed": "失败",
|
||||
"run.status.idle": "待开始",
|
||||
"run.status.pending": "等待中",
|
||||
|
||||
Reference in New Issue
Block a user