From 22dca91a618859f01e0c670ba4398c1804dcfdc9 Mon Sep 17 00:00:00 2001 From: Marco Walz Date: Wed, 4 Mar 2026 18:40:48 +0100 Subject: [PATCH 1/3] feat: add skill evaluation harness with LLM-as-judge scoring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a lightweight eval framework that tests skill effectiveness by comparing agent output with and without the skill loaded. Uses the `claude` CLI for both agent runs and judging — no external API keys or infrastructure needed. - scripts/run-evals.js: eval runner (with/without skill + judge) - skills/icp-cli/evals.json: 3 output evals + 20 trigger evals - Validator now warns if a skill is missing evals.json - Updated CONTRIBUTING.md, CLAUDE.md, README.md with eval guidance --- .claude/CLAUDE.md | 10 ++ .gitignore | 2 + CONTRIBUTING.md | 25 ++++- README.md | 1 + scripts/run-evals.js | 219 +++++++++++++++++++++++++++++++++++++ scripts/validate-skills.js | 7 +- skills/icp-cli/evals.json | 70 ++++++++++++ 7 files changed, 331 insertions(+), 3 deletions(-) create mode 100644 scripts/run-evals.js create mode 100644 skills/icp-cli/evals.json diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index 1cea5ea..c2840bc 100644 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -39,6 +39,16 @@ npm run validate # Fix all errors before committing. Warnings are acceptable ``` Validate runs in CI and blocks deployment on errors. +## Evaluations + +Each skill should have an `evals.json` file with test cases. Run evaluations with: +```bash +node scripts/run-evals.js # All evals +node scripts/run-evals.js --eval "X" # Single eval by name +node scripts/run-evals.js --no-baseline # Skip without-skill baseline +``` +Results are saved to `skills//eval-results/` (gitignored). See `skills/icp-cli/evals.json` for the format. + ## Writing Guidelines - **Write for agents, not humans.** Be explicit with canister IDs, function signatures, and error messages. 
diff --git a/.gitignore b/.gitignore index ce9ba4b..7ac2ab2 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,5 @@ public/llms.txt public/llms-full.txt .astro lighthouse-* +.eval-tmp +skills/*/eval-results/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5de7caf..7de05ec 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -121,13 +121,34 @@ npm run validate # Check frontmatter and sections This runs automatically in CI and blocks deployment on errors. -### 4. That's it — the website auto-discovers skills +### 4. Add evaluation cases + +Create `skills//evals.json` with test cases that verify the skill works. The eval file has two sections: + +- **`output_evals`** — realistic prompts with expected behaviors a judge can check +- **`trigger_evals`** — queries that should/shouldn't activate the skill + +See `skills/icp-cli/evals.json` for a working example. Write prompts the way a developer would actually ask — vague and incomplete, not over-specified test questions. + +**Running evaluations** (optional, requires `claude` CLI): + +```bash +node scripts/run-evals.js # All evals, with + without skill +node scripts/run-evals.js --eval "name" # Single eval +node scripts/run-evals.js --no-baseline # Skip without-skill run +``` + +This sends each prompt to Claude with and without the skill, then has a judge score the output. Results are saved to `skills//eval-results/` (gitignored). + +Including a summary of eval results in your PR description is recommended but not required — running evals needs `claude` CLI access and costs API credits. + +### 5. That's it — the website auto-discovers skills The website is automatically generated from the SKILL.md frontmatter at build time. You do **not** need to edit any source file. Astro reads all `skills/*/SKILL.md` files, parses their frontmatter, and generates the site pages, `llms.txt`, `agent.json`, and other discovery files. Stats (skill count, categories) all update automatically. -### 5. Submit a PR +### 6. 
Submit a PR - One skill per PR - Include a brief description of what the skill covers and why it's needed diff --git a/README.md b/README.md index 9b957d7..611fb75 100644 --- a/README.md +++ b/README.md @@ -86,6 +86,7 @@ See [CONTRIBUTING.md](CONTRIBUTING.md) for how to add or update skills. - **Hosting**: GitHub Pages via Actions - **Skills**: Plain markdown files in `skills/*/SKILL.md` - **Validation**: Structural linter for frontmatter and code blocks (`npm run validate`) +- **Evaluation**: Per-skill eval cases with LLM-as-judge scoring (`node scripts/run-evals.js <skill-name>`) - **Schema**: JSON Schema for frontmatter at `skills/skill.schema.json` - **SEO**: Per-skill meta tags, JSON-LD (TechArticle), sitemap, canonical URLs - **AI Agent Discovery**: `llms.txt`, `llms-full.txt`, `.well-known/agent.json`, per-skill `.md` endpoints diff --git a/scripts/run-evals.js b/scripts/run-evals.js new file mode 100644 index 0000000..a5120f6 --- /dev/null +++ b/scripts/run-evals.js @@ -0,0 +1,219 @@ +#!/usr/bin/env node + +/** + * Skill evaluation runner. + * + * Runs output_evals from a skill's evals.json by sending the prompt to the + * `claude` CLI — once WITH the skill as context, once WITHOUT — then asks a + * judge model to score each expected behavior as pass/fail.
+ * + * Usage: + *   node scripts/run-evals.js <skill-name> [--eval <name>] [--no-baseline] + * + * Examples: + *   node scripts/run-evals.js icp-cli + *   node scripts/run-evals.js icp-cli --eval "Deploy to mainnet" + *   node scripts/run-evals.js icp-cli --no-baseline   # skip without-skill run + * + * Requirements: + *   - `claude` CLI installed and authenticated + */ + +import { readFileSync, writeFileSync, mkdirSync } from "fs"; +import { execSync } from "child_process"; +import { join } from "path"; + +const ROOT = new URL("..", import.meta.url).pathname.replace(/\/$/, ""); + +// --------------------------------------------------------------------------- +// CLI args +// --------------------------------------------------------------------------- +const args = process.argv.slice(2); +const skillName = args.find((a) => !a.startsWith("--")); +if (!skillName) { +  console.error("Usage: node scripts/run-evals.js <skill-name> [--eval <name>] [--no-baseline]"); +  process.exit(1); +} + +const evalFilterIdx = args.indexOf("--eval"); +const evalFilter = evalFilterIdx !== -1 ?
args[evalFilterIdx + 1] : null; +const skipBaseline = args.includes("--no-baseline"); + +// --------------------------------------------------------------------------- +// Load skill + evals +// --------------------------------------------------------------------------- +const skillDir = join(ROOT, "skills", skillName); +const skillContent = readFileSync(join(skillDir, "SKILL.md"), "utf-8"); +const evals = JSON.parse(readFileSync(join(skillDir, "evals.json"), "utf-8")); + +let cases = evals.output_evals; +if (evalFilter) { + cases = cases.filter((c) => c.name.toLowerCase().includes(evalFilter.toLowerCase())); + if (cases.length === 0) { + console.error(`No eval case matching "${evalFilter}"`); + process.exit(1); + } +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +/** Run a prompt through claude CLI and return the output text. */ +function runClaude(prompt, systemPrompt) { + const tmpDir = join(ROOT, ".eval-tmp"); + mkdirSync(tmpDir, { recursive: true }); + + // Write prompt to temp file to avoid all shell escaping issues + const promptFile = join(tmpDir, "prompt.txt"); + writeFileSync(promptFile, prompt); + + let cmd = `cat '${promptFile}' | claude -p --model sonnet`; + if (systemPrompt) { + const systemFile = join(tmpDir, "system-prompt.txt"); + writeFileSync(systemFile, systemPrompt); + cmd += ` --system-prompt "$(cat '${systemFile}')"`; + } + + // Run from /tmp to prevent claude from picking up repo context + try { + return execSync(cmd, { + encoding: "utf-8", + maxBuffer: 1024 * 1024, + timeout: 120_000, + cwd: "/tmp", + }).trim(); + } catch (e) { + return `[ERROR] ${e.message}`; + } +} + +/** Ask claude to judge an output against expected behaviors. */ +function judge(evalCase, output, label) { + const behaviors = evalCase.expected_behaviors + .map((b, i) => `${i + 1}. 
${b}`) + .join("\n"); + + const judgePrompt = `You are an evaluation judge. A coding assistant was given this task: + + +${evalCase.prompt} + + +The assistant produced this output: + + +${output} + + +Score each expected behavior as PASS or FAIL. Be strict — the behavior must be clearly present, not just vaguely implied. Return ONLY a JSON array of objects with "behavior", "pass" (boolean), and "reason" (one sentence). + +Expected behaviors: +${behaviors}`; + + const raw = runClaude(judgePrompt, null); + + // Extract JSON from the response + const jsonMatch = raw.match(/\[[\s\S]*\]/); + if (!jsonMatch) { + console.error(` [${label}] Judge returned non-JSON:\n${raw}\n`); + return null; + } + try { + return JSON.parse(jsonMatch[0]); + } catch { + console.error(` [${label}] Failed to parse judge JSON:\n${jsonMatch[0]}\n`); + return null; + } +} + +// --------------------------------------------------------------------------- +// Run +// --------------------------------------------------------------------------- +console.log(`\nEvaluating skill: ${skillName}`); +console.log(`Cases: ${cases.map((c) => c.name).join(", ")}\n`); + +const results = []; + +for (const evalCase of cases) { + console.log(`━━━ ${evalCase.name} ━━━\n`); + + // Run WITH skill + console.log(" Running WITH skill..."); + const withOutput = runClaude(evalCase.prompt, skillContent); + + // Run WITHOUT skill (baseline) + let withoutOutput = null; + if (!skipBaseline) { + console.log(" Running WITHOUT skill..."); + withoutOutput = runClaude(evalCase.prompt, null); + } + + // Judge + console.log(" Judging WITH skill..."); + const withJudgment = judge(evalCase, withOutput, "with-skill"); + + let withoutJudgment = null; + if (withoutOutput) { + console.log(" Judging WITHOUT skill..."); + withoutJudgment = judge(evalCase, withoutOutput, "without-skill"); + } + + // Print results + if (withJudgment) { + const passed = withJudgment.filter((j) => j.pass).length; + const total = withJudgment.length; + 
console.log(`\n WITH skill: ${passed}/${total} passed`); + for (const j of withJudgment) { + console.log(` ${j.pass ? "✅" : "❌"} ${j.behavior}`); + if (!j.pass) console.log(` → ${j.reason}`); + } + } + + if (withoutJudgment) { + const passed = withoutJudgment.filter((j) => j.pass).length; + const total = withoutJudgment.length; + console.log(`\n WITHOUT skill: ${passed}/${total} passed`); + for (const j of withoutJudgment) { + console.log(` ${j.pass ? "✅" : "❌"} ${j.behavior}`); + if (!j.pass) console.log(` → ${j.reason}`); + } + } + + results.push({ + name: evalCase.name, + with_skill: { output: withOutput, judgment: withJudgment }, + without_skill: withoutOutput + ? { output: withoutOutput, judgment: withoutJudgment } + : null, + }); + + console.log(""); +} + +// --------------------------------------------------------------------------- +// Summary + save +// --------------------------------------------------------------------------- +console.log("━━━ Summary ━━━\n"); +for (const r of results) { + const withScore = r.with_skill.judgment + ? `${r.with_skill.judgment.filter((j) => j.pass).length}/${r.with_skill.judgment.length}` + : "error"; + const withoutScore = r.without_skill?.judgment + ? 
`${r.without_skill.judgment.filter((j) => j.pass).length}/${r.without_skill.judgment.length}` + : "skipped"; + console.log(` ${r.name}: WITH ${withScore} | WITHOUT ${withoutScore}`); +} + +// Save full results +const outDir = join(ROOT, "skills", skillName, "eval-results"); +mkdirSync(outDir, { recursive: true }); +const timestamp = new Date().toISOString().replace(/[:.]/g, "-"); +const outFile = join(outDir, `run-${timestamp}.json`); +writeFileSync(outFile, JSON.stringify(results, null, 2)); +console.log(`\nFull results saved to: ${outFile}\n`); + +// Cleanup +try { + execSync(`rm -rf '${join(ROOT, ".eval-tmp")}'`); +} catch {} diff --git a/scripts/validate-skills.js b/scripts/validate-skills.js index 51a7e78..9cee0d1 100644 --- a/scripts/validate-skills.js +++ b/scripts/validate-skills.js @@ -3,7 +3,7 @@ // Checks frontmatter fields, required sections, and code block annotations. // Run: node scripts/validate-skills.js -import { readFileSync } from "fs"; +import { readFileSync, existsSync } from "fs"; import { join } from "path"; import { readAllSkills, SKILLS_DIR } from "./lib/parse-skill.js"; @@ -130,6 +130,11 @@ for (const skill of skills) { if (!meta.compatibility) { warn(label, `missing "compatibility" field in frontmatter`); } + + // --- Evals validation --- + if (!existsSync(join(SKILLS_DIR, dir, "evals.json"))) { + warn(label, `missing evals.json — see CONTRIBUTING.md for evaluation guidance`); + } } // --- Output --- diff --git a/skills/icp-cli/evals.json b/skills/icp-cli/evals.json new file mode 100644 index 0000000..024e274 --- /dev/null +++ b/skills/icp-cli/evals.json @@ -0,0 +1,70 @@ +{ + "skill": "icp-cli", + "description": "Evaluation cases for the icp-cli skill. Tests whether agents produce correct icp-cli commands and configuration instead of legacy dfx equivalents.", + + "output_evals": [ + { + "name": "New project setup", + "prompt": "I want to build a dapp on ICP with a Rust backend and a React frontend. 
How do I set this up?", + "expected_behaviors": [ + "Uses icp (not dfx) commands throughout", + "Configuration file is icp.yaml, NOT dfx.json", + "Canisters are a YAML array of objects (- name: ...), NOT a keyed map", + "Rust canister uses a recipe with a version pin (e.g., @dfinity/rust@v3.2.0)", + "Frontend/asset canister uses a recipe with a version pin", + "Asset canister recipe includes explicit build commands", + "Shows how to start the local network (icp network start -d)" + ] + }, + { + "name": "Deploy to mainnet", + "prompt": "My canisters work locally, how do I get them on mainnet?", + "expected_behaviors": [ + "Uses 'icp deploy -e ic', NOT 'dfx deploy --network ic' or '--network ic'", + "Mentions cycles are needed", + "Mentions canister IDs are stored in .icp/data/ and should be committed to git", + "Does NOT use --network ic flag for deployment" + ] + }, + { + "name": "Migrate from dfx", + "prompt": "I have an older IC project that still uses dfx and dfx.json. It has a Motoko backend and a frontend. I want to switch to the new CLI. I also have canisters running on mainnet already.", + "expected_behaviors": [ + "Creates icp.yaml with recipe-based canister configuration", + "Motoko canister uses @dfinity/motoko recipe with a version pin", + "Asset canister uses @dfinity/asset-canister recipe with a version pin", + "Explains identity migration (export from dfx, import into icp)", + "Explains canister ID migration via .icp/data/mappings/ic.ids.json", + "Uses correct icp identity commands ('icp identity default' not 'icp identity use')" + ] + } + ], + + "trigger_evals": { + "description": "Queries to test whether the skill activates correctly. 
'should_trigger' queries should cause the skill to load; 'should_not_trigger' queries should NOT activate this skill.", + "should_trigger": [ + "Set up a new Internet Computer project with Rust", + "How do I deploy my canister to the local network?", + "What's the icp.yaml config for a Motoko canister?", + "I'm getting an error with dfx deploy, can you help?", + "How do I start the local replica?", + "Migrate my dfx.json project to the new CLI", + "How do I create a new identity for mainnet deployment?", + "What recipes are available for icp-cli?", + "My icp deploy is failing with a build error", + "How do I check my canister status on mainnet?" + ], + "should_not_trigger": [ + "Add access control to my Motoko canister", + "How does stable memory work in Rust canisters?", + "Implement ICRC-1 token transfer in my canister", + "Write a unit test for my Motoko actor", + "Set up inter-canister calls between two canisters", + "How do I use certified variables?", + "Explain the IC consensus mechanism", + "Add Internet Identity login to my frontend", + "How do I handle canister upgrades safely?", + "What's the best way to store large data on-chain?" + ] + } +} From a035f91363f19a353d8923be52c361cb360d46e9 Mon Sep 17 00:00:00 2001 From: Marco Walz Date: Wed, 4 Mar 2026 18:53:27 +0100 Subject: [PATCH 2/3] feat: add trigger eval support to the eval runner Presents all skill descriptions as a catalog to a judge, then checks whether each query correctly selects (or avoids) the target skill. Batches all queries into a single judge call for efficiency. Usage: node scripts/run-evals.js icp-cli --triggers-only --- scripts/run-evals.js | 250 ++++++++++++++++++++++++++++++++----------- 1 file changed, 185 insertions(+), 65 deletions(-) diff --git a/scripts/run-evals.js b/scripts/run-evals.js index a5120f6..cb44c33 100644 --- a/scripts/run-evals.js +++ b/scripts/run-evals.js @@ -3,17 +3,20 @@ /** * Skill evaluation runner. 
* - * Runs output_evals from a skill's evals.json by sending the prompt to the - * `claude` CLI — once WITH the skill as context, once WITHOUT — then asks a - * judge model to score each expected behavior as pass/fail. + * Runs output_evals and trigger_evals from a skill's evals.json. + * - Output evals: sends prompts to `claude` CLI with/without the skill, + * then uses a judge to score expected behaviors as pass/fail. + * - Trigger evals: presents all skill descriptions to a judge and checks + * whether each query would correctly trigger (or not trigger) the skill. * * Usage: - * node scripts/run-evals.js [--eval ] [--no-baseline] + * node scripts/run-evals.js [--eval ] [--no-baseline] [--triggers-only] * * Examples: * node scripts/run-evals.js icp-cli * node scripts/run-evals.js icp-cli --eval "Deploy to mainnet" - * node scripts/run-evals.js icp-cli --no-baseline # skip without-skill run + * node scripts/run-evals.js icp-cli --no-baseline + * node scripts/run-evals.js icp-cli --triggers-only * * Requirements: * - `claude` CLI installed and authenticated @@ -22,6 +25,7 @@ import { readFileSync, writeFileSync, mkdirSync } from "fs"; import { execSync } from "child_process"; import { join } from "path"; +import { readAllSkills } from "./lib/parse-skill.js"; const ROOT = new URL("..", import.meta.url).pathname.replace(/\/$/, ""); @@ -31,13 +35,14 @@ const ROOT = new URL("..", import.meta.url).pathname.replace(/\/$/, ""); const args = process.argv.slice(2); const skillName = args.find((a) => !a.startsWith("--")); if (!skillName) { - console.error("Usage: node scripts/run-evals.js [--eval ] [--no-baseline]"); + console.error("Usage: node scripts/run-evals.js [--eval ] [--no-baseline] [--triggers-only]"); process.exit(1); } const evalFilterIdx = args.indexOf("--eval"); const evalFilter = evalFilterIdx !== -1 ? 
args[evalFilterIdx + 1] : null; const skipBaseline = args.includes("--no-baseline"); +const triggersOnly = args.includes("--triggers-only"); // --------------------------------------------------------------------------- // Load skill + evals @@ -46,10 +51,10 @@ const skillDir = join(ROOT, "skills", skillName); const skillContent = readFileSync(join(skillDir, "SKILL.md"), "utf-8"); const evals = JSON.parse(readFileSync(join(skillDir, "evals.json"), "utf-8")); -let cases = evals.output_evals; +let outputCases = evals.output_evals || []; if (evalFilter) { - cases = cases.filter((c) => c.name.toLowerCase().includes(evalFilter.toLowerCase())); - if (cases.length === 0) { + outputCases = outputCases.filter((c) => c.name.toLowerCase().includes(evalFilter.toLowerCase())); + if (outputCases.length === 0 && !triggersOnly) { console.error(`No eval case matching "${evalFilter}"`); process.exit(1); } @@ -127,82 +132,197 @@ ${behaviors}`; } } +/** Build a skill catalog string from all skills in the repo. */ +function buildSkillCatalog() { + const skills = readAllSkills(); + return skills + .map((s) => `- **${s.meta.name}**: ${s.meta.description}`) + .join("\n"); +} + +/** Run trigger evals — check if queries would correctly select the skill. */ +function runTriggerEvals(triggerEvals, targetSkill) { + const catalog = buildSkillCatalog(); + const allQueries = [ + ...(triggerEvals.should_trigger || []).map((q) => ({ query: q, expected: true })), + ...(triggerEvals.should_not_trigger || []).map((q) => ({ query: q, expected: false })), + ]; + + if (allQueries.length === 0) return null; + + // Batch all queries into a single judge call for efficiency + const queryList = allQueries + .map((q, i) => `${i + 1}. "${q.query}"`) + .join("\n"); + + const triggerPrompt = `You are evaluating skill triggering for an agent skill catalog. Given a user query, determine which skill (if any) from the catalog below would be the best match. 
+ + +${catalog} + + +For each query below, respond with the skill name that best matches, or "none" if no skill is a good fit. Return ONLY a JSON array of objects with "query" (string), "selected_skill" (string or "none"), and "reason" (one sentence). + +Queries: +${queryList}`; + + console.log(" Running trigger evaluation..."); + const raw = runClaude(triggerPrompt, null); + + const jsonMatch = raw.match(/\[[\s\S]*\]/); + if (!jsonMatch) { + console.error(` [triggers] Judge returned non-JSON:\n${raw}\n`); + return null; + } + + let selections; + try { + selections = JSON.parse(jsonMatch[0]); + } catch { + console.error(` [triggers] Failed to parse judge JSON:\n${jsonMatch[0]}\n`); + return null; + } + + // Score each query + const results = allQueries.map((q, i) => { + const selection = selections[i]; + if (!selection) return { ...q, pass: false, selected: "error", reason: "No judge response" }; + + const selected = selection.selected_skill?.toLowerCase() || "none"; + const isTarget = selected === targetSkill.toLowerCase(); + + const pass = q.expected ? 
isTarget : !isTarget; + return { + ...q, + pass, + selected: selection.selected_skill || "none", + reason: selection.reason || "", + }; + }); + + return results; +} + // --------------------------------------------------------------------------- -// Run +// Run output evals // --------------------------------------------------------------------------- -console.log(`\nEvaluating skill: ${skillName}`); -console.log(`Cases: ${cases.map((c) => c.name).join(", ")}\n`); +const allResults = { output_evals: [], trigger_evals: null }; -const results = []; +if (!triggersOnly && outputCases.length > 0) { + console.log(`\nEvaluating skill: ${skillName}`); + console.log(`Output cases: ${outputCases.map((c) => c.name).join(", ")}\n`); -for (const evalCase of cases) { - console.log(`━━━ ${evalCase.name} ━━━\n`); + for (const evalCase of outputCases) { + console.log(`━━━ ${evalCase.name} ━━━\n`); - // Run WITH skill - console.log(" Running WITH skill..."); - const withOutput = runClaude(evalCase.prompt, skillContent); + // Run WITH skill + console.log(" Running WITH skill..."); + const withOutput = runClaude(evalCase.prompt, skillContent); - // Run WITHOUT skill (baseline) - let withoutOutput = null; - if (!skipBaseline) { - console.log(" Running WITHOUT skill..."); - withoutOutput = runClaude(evalCase.prompt, null); - } + // Run WITHOUT skill (baseline) + let withoutOutput = null; + if (!skipBaseline) { + console.log(" Running WITHOUT skill..."); + withoutOutput = runClaude(evalCase.prompt, null); + } - // Judge - console.log(" Judging WITH skill..."); - const withJudgment = judge(evalCase, withOutput, "with-skill"); + // Judge + console.log(" Judging WITH skill..."); + const withJudgment = judge(evalCase, withOutput, "with-skill"); - let withoutJudgment = null; - if (withoutOutput) { - console.log(" Judging WITHOUT skill..."); - withoutJudgment = judge(evalCase, withoutOutput, "without-skill"); - } + let withoutJudgment = null; + if (withoutOutput) { + console.log(" Judging 
WITHOUT skill..."); + withoutJudgment = judge(evalCase, withoutOutput, "without-skill"); + } - // Print results - if (withJudgment) { - const passed = withJudgment.filter((j) => j.pass).length; - const total = withJudgment.length; - console.log(`\n WITH skill: ${passed}/${total} passed`); - for (const j of withJudgment) { - console.log(` ${j.pass ? "✅" : "❌"} ${j.behavior}`); - if (!j.pass) console.log(` → ${j.reason}`); + // Print results + if (withJudgment) { + const passed = withJudgment.filter((j) => j.pass).length; + const total = withJudgment.length; + console.log(`\n WITH skill: ${passed}/${total} passed`); + for (const j of withJudgment) { + console.log(` ${j.pass ? "✅" : "❌"} ${j.behavior}`); + if (!j.pass) console.log(` → ${j.reason}`); + } } - } - if (withoutJudgment) { - const passed = withoutJudgment.filter((j) => j.pass).length; - const total = withoutJudgment.length; - console.log(`\n WITHOUT skill: ${passed}/${total} passed`); - for (const j of withoutJudgment) { - console.log(` ${j.pass ? "✅" : "❌"} ${j.behavior}`); - if (!j.pass) console.log(` → ${j.reason}`); + if (withoutJudgment) { + const passed = withoutJudgment.filter((j) => j.pass).length; + const total = withoutJudgment.length; + console.log(`\n WITHOUT skill: ${passed}/${total} passed`); + for (const j of withoutJudgment) { + console.log(` ${j.pass ? "✅" : "❌"} ${j.behavior}`); + if (!j.pass) console.log(` → ${j.reason}`); + } } + + allResults.output_evals.push({ + name: evalCase.name, + with_skill: { output: withOutput, judgment: withJudgment }, + without_skill: withoutOutput + ? { output: withoutOutput, judgment: withoutJudgment } + : null, + }); + + console.log(""); } +} - results.push({ - name: evalCase.name, - with_skill: { output: withOutput, judgment: withJudgment }, - without_skill: withoutOutput - ? 
{ output: withoutOutput, judgment: withoutJudgment } - : null, - }); +// --------------------------------------------------------------------------- +// Run trigger evals +// --------------------------------------------------------------------------- +if (evals.trigger_evals && !evalFilter) { + console.log(`━━━ Trigger Evals ━━━\n`); - console.log(""); + const triggerResults = runTriggerEvals(evals.trigger_evals, skillName); + allResults.trigger_evals = triggerResults; + + if (triggerResults) { + const shouldTrigger = triggerResults.filter((r) => r.expected); + const shouldNot = triggerResults.filter((r) => !r.expected); + + const triggerPassed = shouldTrigger.filter((r) => r.pass).length; + const notTriggerPassed = shouldNot.filter((r) => r.pass).length; + + console.log(`\n Should trigger: ${triggerPassed}/${shouldTrigger.length} correct`); + for (const r of shouldTrigger) { + console.log(` ${r.pass ? "✅" : "❌"} "${r.query}"`); + if (!r.pass) console.log(` → selected "${r.selected}" instead — ${r.reason}`); + } + + console.log(`\n Should NOT trigger: ${notTriggerPassed}/${shouldNot.length} correct`); + for (const r of shouldNot) { + console.log(` ${r.pass ? "✅" : "❌"} "${r.query}"`); + if (!r.pass) console.log(` → incorrectly selected "${r.selected}" — ${r.reason}`); + } + + console.log(""); + } } // --------------------------------------------------------------------------- // Summary + save // --------------------------------------------------------------------------- console.log("━━━ Summary ━━━\n"); -for (const r of results) { - const withScore = r.with_skill.judgment - ? `${r.with_skill.judgment.filter((j) => j.pass).length}/${r.with_skill.judgment.length}` - : "error"; - const withoutScore = r.without_skill?.judgment - ? 
`${r.without_skill.judgment.filter((j) => j.pass).length}/${r.without_skill.judgment.length}` - : "skipped"; - console.log(` ${r.name}: WITH ${withScore} | WITHOUT ${withoutScore}`); + +if (allResults.output_evals.length > 0) { + console.log(" Output evals:"); + for (const r of allResults.output_evals) { + const withScore = r.with_skill.judgment + ? `${r.with_skill.judgment.filter((j) => j.pass).length}/${r.with_skill.judgment.length}` + : "error"; + const withoutScore = r.without_skill?.judgment + ? `${r.without_skill.judgment.filter((j) => j.pass).length}/${r.without_skill.judgment.length}` + : "skipped"; + console.log(` ${r.name}: WITH ${withScore} | WITHOUT ${withoutScore}`); + } +} + +if (allResults.trigger_evals) { + const shouldTrigger = allResults.trigger_evals.filter((r) => r.expected); + const shouldNot = allResults.trigger_evals.filter((r) => !r.expected); + console.log(` Trigger evals: should-trigger ${shouldTrigger.filter((r) => r.pass).length}/${shouldTrigger.length} | should-not-trigger ${shouldNot.filter((r) => r.pass).length}/${shouldNot.length}`); } // Save full results @@ -210,7 +330,7 @@ const outDir = join(ROOT, "skills", skillName, "eval-results"); mkdirSync(outDir, { recursive: true }); const timestamp = new Date().toISOString().replace(/[:.]/g, "-"); const outFile = join(outDir, `run-${timestamp}.json`); -writeFileSync(outFile, JSON.stringify(results, null, 2)); +writeFileSync(outFile, JSON.stringify(allResults, null, 2)); console.log(`\nFull results saved to: ${outFile}\n`); // Cleanup From 96a65bf1385753ba3541cb012688ae2a0c17f0ce Mon Sep 17 00:00:00 2001 From: Marco Walz Date: Thu, 5 Mar 2026 18:45:49 +0100 Subject: [PATCH 3/3] refactor: rename eval script, move evals out of skill directories MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename scripts/run-evals.js → scripts/evaluate-skills.js - Move skills/icp-cli/evals.json → evaluations/icp-cli.json - Results now save to evaluations/results/ 
(gitignored) - Validator checks evaluations/.json instead of skill dirs - Add --triggers-only flag for trigger-only evaluation runs - Updated all references in CONTRIBUTING.md, CLAUDE.md, README.md Keeps skill directories clean (one skill = one SKILL.md). --- .claude/CLAUDE.md | 11 +++++----- .gitignore | 2 +- CONTRIBUTING.md | 13 ++++++------ README.md | 2 +- .../evals.json => evaluations/icp-cli.json | 0 scripts/{run-evals.js => evaluate-skills.js} | 21 ++++++++++--------- scripts/validate-skills.js | 5 +++-- 7 files changed, 29 insertions(+), 25 deletions(-) rename skills/icp-cli/evals.json => evaluations/icp-cli.json (100%) rename scripts/{run-evals.js => evaluate-skills.js} (93%) diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index c2840bc..c30ed6e 100644 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -41,13 +41,14 @@ Validate runs in CI and blocks deployment on errors. ## Evaluations -Each skill should have an `evals.json` file with test cases. Run evaluations with: +Each skill should have an evaluation file at `evaluations/.json`. Run evaluations with: ```bash -node scripts/run-evals.js # All evals -node scripts/run-evals.js --eval "X" # Single eval by name -node scripts/run-evals.js --no-baseline # Skip without-skill baseline +node scripts/evaluate-skills.js # All evals +node scripts/evaluate-skills.js --eval "X" # Single eval by name +node scripts/evaluate-skills.js --no-baseline # Skip without-skill baseline +node scripts/evaluate-skills.js --triggers-only # Trigger evals only ``` -Results are saved to `skills//eval-results/` (gitignored). See `skills/icp-cli/evals.json` for the format. +Results are saved to `evaluations/results/` (gitignored). See `evaluations/icp-cli.json` for the format. 
## Writing Guidelines diff --git a/.gitignore b/.gitignore index 7ac2ab2..a02b91e 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,4 @@ public/llms-full.txt .astro lighthouse-* .eval-tmp -skills/*/eval-results/ +evaluations/results/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7de05ec..0a6ea6a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -123,22 +123,23 @@ This runs automatically in CI and blocks deployment on errors. ### 4. Add evaluation cases -Create `skills/<skill-name>/evals.json` with test cases that verify the skill works. The eval file has two sections: +Create `evaluations/<skill-name>.json` with test cases that verify the skill works. The eval file has two sections: - **`output_evals`** — realistic prompts with expected behaviors a judge can check - **`trigger_evals`** — queries that should/shouldn't activate the skill -See `skills/icp-cli/evals.json` for a working example. Write prompts the way a developer would actually ask — vague and incomplete, not over-specified test questions. +See `evaluations/icp-cli.json` for a working example. Write prompts the way a developer would actually ask — vague and incomplete, not over-specified test questions. **Running evaluations** (optional, requires `claude` CLI): ```bash -node scripts/run-evals.js <skill-name> # All evals, with + without skill -node scripts/run-evals.js <skill-name> --eval "name" # Single eval -node scripts/run-evals.js <skill-name> --no-baseline # Skip without-skill run +node scripts/evaluate-skills.js <skill-name> # All evals, with + without skill +node scripts/evaluate-skills.js <skill-name> --eval "name" # Single eval +node scripts/evaluate-skills.js <skill-name> --no-baseline # Skip without-skill run +node scripts/evaluate-skills.js <skill-name> --triggers-only # Trigger evals only ``` -This sends each prompt to Claude with and without the skill, then has a judge score the output.
Results are saved to `evaluations/results/` (gitignored). Including a summary of eval results in your PR description is recommended but not required — running evals needs `claude` CLI access and costs API credits. diff --git a/README.md b/README.md index 611fb75..a307323 100644 --- a/README.md +++ b/README.md @@ -86,7 +86,7 @@ See [CONTRIBUTING.md](CONTRIBUTING.md) for how to add or update skills. - **Hosting**: GitHub Pages via Actions - **Skills**: Plain markdown files in `skills/*/SKILL.md` - **Validation**: Structural linter for frontmatter and code blocks (`npm run validate`) -- **Evaluation**: Per-skill eval cases with LLM-as-judge scoring (`node scripts/run-evals.js <skill>`) +- **Evaluation**: Per-skill eval cases with LLM-as-judge scoring (`node scripts/evaluate-skills.js <skill>`) - **Schema**: JSON Schema for frontmatter at `skills/skill.schema.json` - **SEO**: Per-skill meta tags, JSON-LD (TechArticle), sitemap, canonical URLs - **AI Agent Discovery**: `llms.txt`, `llms-full.txt`, `.well-known/agent.json`, per-skill `.md` endpoints diff --git a/skills/icp-cli/evals.json b/evaluations/icp-cli.json similarity index 100% rename from skills/icp-cli/evals.json rename to evaluations/icp-cli.json diff --git a/scripts/run-evals.js b/scripts/evaluate-skills.js similarity index 93% rename from scripts/run-evals.js rename to scripts/evaluate-skills.js index cb44c33..7603200 100644 --- a/scripts/run-evals.js +++ b/scripts/evaluate-skills.js @@ -3,20 +3,20 @@ /** * Skill evaluation runner. * - * Runs output_evals and trigger_evals from a skill's evals.json. + * Runs output_evals and trigger_evals from evaluations/<skill>.json. * - Output evals: sends prompts to `claude` CLI with/without the skill, * then uses a judge to score expected behaviors as pass/fail. * - Trigger evals: presents all skill descriptions to a judge and checks * whether each query would correctly trigger (or not trigger) the skill. 
* * Usage: - * node scripts/run-evals.js <skill> [--eval <name>] [--no-baseline] [--triggers-only] + * node scripts/evaluate-skills.js <skill> [--eval <name>] [--no-baseline] [--triggers-only] * * Examples: - * node scripts/run-evals.js icp-cli - * node scripts/run-evals.js icp-cli --eval "Deploy to mainnet" - * node scripts/run-evals.js icp-cli --no-baseline - * node scripts/run-evals.js icp-cli --triggers-only + * node scripts/evaluate-skills.js icp-cli + * node scripts/evaluate-skills.js icp-cli --eval "Deploy to mainnet" + * node scripts/evaluate-skills.js icp-cli --no-baseline + * node scripts/evaluate-skills.js icp-cli --triggers-only * * Requirements: * - `claude` CLI installed and authenticated @@ -35,7 +35,7 @@ const ROOT = new URL("..", import.meta.url).pathname.replace(/\/$/, ""); const args = process.argv.slice(2); const skillName = args.find((a) => !a.startsWith("--")); if (!skillName) { - console.error("Usage: node scripts/run-evals.js <skill> [--eval <name>] [--no-baseline] [--triggers-only]"); + console.error("Usage: node scripts/evaluate-skills.js <skill> [--eval <name>] [--no-baseline] [--triggers-only]"); process.exit(1); } @@ -49,7 +49,8 @@ const triggersOnly = args.includes("--triggers-only"); // --------------------------------------------------------------------------- const skillDir = join(ROOT, "skills", skillName); const skillContent = readFileSync(join(skillDir, "SKILL.md"), "utf-8"); -const evals = JSON.parse(readFileSync(join(skillDir, "evals.json"), "utf-8")); +const evalsFile = join(ROOT, "evaluations", `${skillName}.json`); +const evals = JSON.parse(readFileSync(evalsFile, "utf-8")); let outputCases = evals.output_evals || []; if (evalFilter) { @@ -326,10 +327,10 @@ if (allResults.trigger_evals) { } // Save full results -const outDir = join(ROOT, "skills", skillName, "eval-results"); +const outDir = join(ROOT, "evaluations", "results"); mkdirSync(outDir, { recursive: true }); const timestamp = new Date().toISOString().replace(/[:.]/g, "-"); -const outFile = join(outDir, 
`run-${timestamp}.json`); +const outFile = join(outDir, `${skillName}-${timestamp}.json`); writeFileSync(outFile, JSON.stringify(allResults, null, 2)); console.log(`\nFull results saved to: ${outFile}\n`); diff --git a/scripts/validate-skills.js b/scripts/validate-skills.js index 9cee0d1..e4759a0 100644 --- a/scripts/validate-skills.js +++ b/scripts/validate-skills.js @@ -132,8 +132,9 @@ for (const skill of skills) { } // --- Evals validation --- - if (!existsSync(join(SKILLS_DIR, dir, "evals.json"))) { - warn(label, `missing evals.json — see CONTRIBUTING.md for evaluation guidance`); + const evalsDir = join(SKILLS_DIR, "..", "evaluations"); + if (!existsSync(join(evalsDir, `${dir}.json`))) { + warn(label, `missing evaluations/${dir}.json — see CONTRIBUTING.md for evaluation guidance`); } }