{
  "benchmarks": [
    {
      "id": "arena_elo",
      "name": "Chatbot Arena Elo",
      "short": "Arena",
      "description": "Crowdsourced human-preference Elo rating from LMSYS Chatbot Arena. Higher is better.",
      "scale": "elo",
      "higherIsBetter": true,
      "weight": 1.4
    },
    {
      "id": "mmlu_pro",
      "name": "MMLU-Pro",
      "short": "MMLU-Pro",
      "description": "Massive Multitask Language Understanding (Pro variant) — broad academic knowledge across 57 subjects.",
      "scale": "percent",
      "higherIsBetter": true,
      "weight": 1.0
    },
    {
      "id": "gpqa",
      "name": "GPQA Diamond",
      "short": "GPQA",
      "description": "Graduate-level Google-proof Q&A in physics, chemistry, biology — measures expert reasoning.",
      "scale": "percent",
      "higherIsBetter": true,
      "weight": 1.1
    },
    {
      "id": "math",
      "name": "MATH",
      "short": "MATH",
      "description": "Competition-level mathematics problem solving (Hendrycks et al.).",
      "scale": "percent",
      "higherIsBetter": true,
      "weight": 0.9
    },
    {
      "id": "humaneval",
      "name": "HumanEval",
      "short": "HumanEval",
      "description": "Python coding correctness on 164 hand-written programming problems.",
      "scale": "percent",
      "higherIsBetter": true,
      "weight": 0.8
    },
    {
      "id": "swe_bench",
      "name": "SWE-Bench Verified",
      "short": "SWE-Bench",
      "description": "Real-world GitHub issue resolution against verified test cases — agentic coding ability.",
      "scale": "percent",
      "higherIsBetter": true,
      "weight": 1.2
    }
  ]
}
