From 22dca91a618859f01e0c670ba4398c1804dcfdc9 Mon Sep 17 00:00:00 2001 From: Marco Walz Date: Wed, 4 Mar 2026 18:40:48 +0100 Subject: [PATCH 1/3] feat: add skill evaluation harness with LLM-as-judge scoring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a lightweight eval framework that tests skill effectiveness by comparing agent output with and without the skill loaded. Uses the `claude` CLI for both agent runs and judging — no external API keys or infrastructure needed. - scripts/run-evals.js: eval runner (with/without skill + judge) - skills/icp-cli/evals.json: 3 output evals + 20 trigger evals - Validator now warns if a skill is missing evals.json - Updated CONTRIBUTING.md, CLAUDE.md, README.md with eval guidance --- .claude/CLAUDE.md | 10 ++ .gitignore | 2 + CONTRIBUTING.md | 25 ++++- README.md | 1 + scripts/run-evals.js | 219 +++++++++++++++++++++++++++++++++++++ scripts/validate-skills.js | 7 +- skills/icp-cli/evals.json | 70 ++++++++++++ 7 files changed, 331 insertions(+), 3 deletions(-) create mode 100644 scripts/run-evals.js create mode 100644 skills/icp-cli/evals.json diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index 1cea5ea..c2840bc 100644 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -39,6 +39,16 @@ npm run validate # Fix all errors before committing. Warnings are acceptable ``` Validate runs in CI and blocks deployment on errors. +## Evaluations + +Each skill should have an `evals.json` file with test cases. Run evaluations with: +```bash +node scripts/run-evals.js # All evals +node scripts/run-evals.js --eval "X" # Single eval by name +node scripts/run-evals.js --no-baseline # Skip without-skill baseline +``` +Results are saved to `skills//eval-results/` (gitignored). See `skills/icp-cli/evals.json` for the format. + ## Writing Guidelines - **Write for agents, not humans.** Be explicit with canister IDs, function signatures, and error messages. 
diff --git a/.gitignore b/.gitignore index ce9ba4b..7ac2ab2 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,5 @@ public/llms.txt public/llms-full.txt .astro lighthouse-* +.eval-tmp +skills/*/eval-results/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5de7caf..7de05ec 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -121,13 +121,34 @@ npm run validate # Check frontmatter and sections This runs automatically in CI and blocks deployment on errors. -### 4. That's it — the website auto-discovers skills +### 4. Add evaluation cases + +Create `skills//evals.json` with test cases that verify the skill works. The eval file has two sections: + +- **`output_evals`** — realistic prompts with expected behaviors a judge can check +- **`trigger_evals`** — queries that should/shouldn't activate the skill + +See `skills/icp-cli/evals.json` for a working example. Write prompts the way a developer would actually ask — vague and incomplete, not over-specified test questions. + +**Running evaluations** (optional, requires `claude` CLI): + +```bash +node scripts/run-evals.js # All evals, with + without skill +node scripts/run-evals.js --eval "name" # Single eval +node scripts/run-evals.js --no-baseline # Skip without-skill run +``` + +This sends each prompt to Claude with and without the skill, then has a judge score the output. Results are saved to `skills//eval-results/` (gitignored). + +Including a summary of eval results in your PR description is recommended but not required — running evals needs `claude` CLI access and costs API credits. + +### 5. That's it — the website auto-discovers skills The website is automatically generated from the SKILL.md frontmatter at build time. You do **not** need to edit any source file. Astro reads all `skills/*/SKILL.md` files, parses their frontmatter, and generates the site pages, `llms.txt`, `agent.json`, and other discovery files. Stats (skill count, categories) all update automatically. -### 5. Submit a PR +### 6. 
Submit a PR - One skill per PR - Include a brief description of what the skill covers and why it's needed diff --git a/README.md b/README.md index 9b957d7..611fb75 100644 --- a/README.md +++ b/README.md @@ -86,6 +86,7 @@ See [CONTRIBUTING.md](CONTRIBUTING.md) for how to add or update skills. - **Hosting**: GitHub Pages via Actions - **Skills**: Plain markdown files in `skills/*/SKILL.md` - **Validation**: Structural linter for frontmatter and code blocks (`npm run validate`) +- **Evaluation**: Per-skill eval cases with LLM-as-judge scoring (`node scripts/run-evals.js <skill-name>`) - **Schema**: JSON Schema for frontmatter at `skills/skill.schema.json` - **SEO**: Per-skill meta tags, JSON-LD (TechArticle), sitemap, canonical URLs - **AI Agent Discovery**: `llms.txt`, `llms-full.txt`, `.well-known/agent.json`, per-skill `.md` endpoints diff --git a/scripts/run-evals.js b/scripts/run-evals.js new file mode 100644 index 0000000..a5120f6 --- /dev/null +++ b/scripts/run-evals.js @@ -0,0 +1,219 @@ +#!/usr/bin/env node + +/** + * Skill evaluation runner. + * + * Runs output_evals from a skill's evals.json by sending the prompt to the + * `claude` CLI — once WITH the skill as context, once WITHOUT — then asks a + * judge model to score each expected behavior as pass/fail.
+ * + * Usage: + *   node scripts/run-evals.js <skill-name> [--eval <name>] [--no-baseline] + * + * Examples: + *   node scripts/run-evals.js icp-cli + *   node scripts/run-evals.js icp-cli --eval "Deploy to mainnet" + *   node scripts/run-evals.js icp-cli --no-baseline   # skip without-skill run + * + * Requirements: + *   - `claude` CLI installed and authenticated + */ + +import { readFileSync, writeFileSync, mkdirSync } from "fs"; +import { execSync } from "child_process"; +import { join } from "path"; + +const ROOT = new URL("..", import.meta.url).pathname.replace(/\/$/, ""); + +// --------------------------------------------------------------------------- +// CLI args +// --------------------------------------------------------------------------- +const args = process.argv.slice(2); +const skillName = args.find((a) => !a.startsWith("--")); +if (!skillName) { +  console.error("Usage: node scripts/run-evals.js <skill-name> [--eval <name>] [--no-baseline]"); +  process.exit(1); +} + +const evalFilterIdx = args.indexOf("--eval"); +const evalFilter = evalFilterIdx !== -1 ?
args[evalFilterIdx + 1] : null; +const skipBaseline = args.includes("--no-baseline"); + +// --------------------------------------------------------------------------- +// Load skill + evals +// --------------------------------------------------------------------------- +const skillDir = join(ROOT, "skills", skillName); +const skillContent = readFileSync(join(skillDir, "SKILL.md"), "utf-8"); +const evals = JSON.parse(readFileSync(join(skillDir, "evals.json"), "utf-8")); + +let cases = evals.output_evals; +if (evalFilter) { + cases = cases.filter((c) => c.name.toLowerCase().includes(evalFilter.toLowerCase())); + if (cases.length === 0) { + console.error(`No eval case matching "${evalFilter}"`); + process.exit(1); + } +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +/** Run a prompt through claude CLI and return the output text. */ +function runClaude(prompt, systemPrompt) { + const tmpDir = join(ROOT, ".eval-tmp"); + mkdirSync(tmpDir, { recursive: true }); + + // Write prompt to temp file to avoid all shell escaping issues + const promptFile = join(tmpDir, "prompt.txt"); + writeFileSync(promptFile, prompt); + + let cmd = `cat '${promptFile}' | claude -p --model sonnet`; + if (systemPrompt) { + const systemFile = join(tmpDir, "system-prompt.txt"); + writeFileSync(systemFile, systemPrompt); + cmd += ` --system-prompt "$(cat '${systemFile}')"`; + } + + // Run from /tmp to prevent claude from picking up repo context + try { + return execSync(cmd, { + encoding: "utf-8", + maxBuffer: 1024 * 1024, + timeout: 120_000, + cwd: "/tmp", + }).trim(); + } catch (e) { + return `[ERROR] ${e.message}`; + } +} + +/** Ask claude to judge an output against expected behaviors. */ +function judge(evalCase, output, label) { + const behaviors = evalCase.expected_behaviors + .map((b, i) => `${i + 1}. 
${b}`) + .join("\n"); + + const judgePrompt = `You are an evaluation judge. A coding assistant was given this task: + + +${evalCase.prompt} + + +The assistant produced this output: + + +${output} + + +Score each expected behavior as PASS or FAIL. Be strict — the behavior must be clearly present, not just vaguely implied. Return ONLY a JSON array of objects with "behavior", "pass" (boolean), and "reason" (one sentence). + +Expected behaviors: +${behaviors}`; + + const raw = runClaude(judgePrompt, null); + + // Extract JSON from the response + const jsonMatch = raw.match(/\[[\s\S]*\]/); + if (!jsonMatch) { + console.error(` [${label}] Judge returned non-JSON:\n${raw}\n`); + return null; + } + try { + return JSON.parse(jsonMatch[0]); + } catch { + console.error(` [${label}] Failed to parse judge JSON:\n${jsonMatch[0]}\n`); + return null; + } +} + +// --------------------------------------------------------------------------- +// Run +// --------------------------------------------------------------------------- +console.log(`\nEvaluating skill: ${skillName}`); +console.log(`Cases: ${cases.map((c) => c.name).join(", ")}\n`); + +const results = []; + +for (const evalCase of cases) { + console.log(`━━━ ${evalCase.name} ━━━\n`); + + // Run WITH skill + console.log(" Running WITH skill..."); + const withOutput = runClaude(evalCase.prompt, skillContent); + + // Run WITHOUT skill (baseline) + let withoutOutput = null; + if (!skipBaseline) { + console.log(" Running WITHOUT skill..."); + withoutOutput = runClaude(evalCase.prompt, null); + } + + // Judge + console.log(" Judging WITH skill..."); + const withJudgment = judge(evalCase, withOutput, "with-skill"); + + let withoutJudgment = null; + if (withoutOutput) { + console.log(" Judging WITHOUT skill..."); + withoutJudgment = judge(evalCase, withoutOutput, "without-skill"); + } + + // Print results + if (withJudgment) { + const passed = withJudgment.filter((j) => j.pass).length; + const total = withJudgment.length; + 
console.log(`\n WITH skill: ${passed}/${total} passed`); + for (const j of withJudgment) { + console.log(` ${j.pass ? "✅" : "❌"} ${j.behavior}`); + if (!j.pass) console.log(` → ${j.reason}`); + } + } + + if (withoutJudgment) { + const passed = withoutJudgment.filter((j) => j.pass).length; + const total = withoutJudgment.length; + console.log(`\n WITHOUT skill: ${passed}/${total} passed`); + for (const j of withoutJudgment) { + console.log(` ${j.pass ? "✅" : "❌"} ${j.behavior}`); + if (!j.pass) console.log(` → ${j.reason}`); + } + } + + results.push({ + name: evalCase.name, + with_skill: { output: withOutput, judgment: withJudgment }, + without_skill: withoutOutput + ? { output: withoutOutput, judgment: withoutJudgment } + : null, + }); + + console.log(""); +} + +// --------------------------------------------------------------------------- +// Summary + save +// --------------------------------------------------------------------------- +console.log("━━━ Summary ━━━\n"); +for (const r of results) { + const withScore = r.with_skill.judgment + ? `${r.with_skill.judgment.filter((j) => j.pass).length}/${r.with_skill.judgment.length}` + : "error"; + const withoutScore = r.without_skill?.judgment + ? 
`${r.without_skill.judgment.filter((j) => j.pass).length}/${r.without_skill.judgment.length}` + : "skipped"; + console.log(` ${r.name}: WITH ${withScore} | WITHOUT ${withoutScore}`); +} + +// Save full results +const outDir = join(ROOT, "skills", skillName, "eval-results"); +mkdirSync(outDir, { recursive: true }); +const timestamp = new Date().toISOString().replace(/[:.]/g, "-"); +const outFile = join(outDir, `run-${timestamp}.json`); +writeFileSync(outFile, JSON.stringify(results, null, 2)); +console.log(`\nFull results saved to: ${outFile}\n`); + +// Cleanup +try { + execSync(`rm -rf '${join(ROOT, ".eval-tmp")}'`); +} catch {} diff --git a/scripts/validate-skills.js b/scripts/validate-skills.js index 51a7e78..9cee0d1 100644 --- a/scripts/validate-skills.js +++ b/scripts/validate-skills.js @@ -3,7 +3,7 @@ // Checks frontmatter fields, required sections, and code block annotations. // Run: node scripts/validate-skills.js -import { readFileSync } from "fs"; +import { readFileSync, existsSync } from "fs"; import { join } from "path"; import { readAllSkills, SKILLS_DIR } from "./lib/parse-skill.js"; @@ -130,6 +130,11 @@ for (const skill of skills) { if (!meta.compatibility) { warn(label, `missing "compatibility" field in frontmatter`); } + + // --- Evals validation --- + if (!existsSync(join(SKILLS_DIR, dir, "evals.json"))) { + warn(label, `missing evals.json — see CONTRIBUTING.md for evaluation guidance`); + } } // --- Output --- diff --git a/skills/icp-cli/evals.json b/skills/icp-cli/evals.json new file mode 100644 index 0000000..024e274 --- /dev/null +++ b/skills/icp-cli/evals.json @@ -0,0 +1,70 @@ +{ + "skill": "icp-cli", + "description": "Evaluation cases for the icp-cli skill. Tests whether agents produce correct icp-cli commands and configuration instead of legacy dfx equivalents.", + + "output_evals": [ + { + "name": "New project setup", + "prompt": "I want to build a dapp on ICP with a Rust backend and a React frontend. 
How do I set this up?", + "expected_behaviors": [ + "Uses icp (not dfx) commands throughout", + "Configuration file is icp.yaml, NOT dfx.json", + "Canisters are a YAML array of objects (- name: ...), NOT a keyed map", + "Rust canister uses a recipe with a version pin (e.g., @dfinity/rust@v3.2.0)", + "Frontend/asset canister uses a recipe with a version pin", + "Asset canister recipe includes explicit build commands", + "Shows how to start the local network (icp network start -d)" + ] + }, + { + "name": "Deploy to mainnet", + "prompt": "My canisters work locally, how do I get them on mainnet?", + "expected_behaviors": [ + "Uses 'icp deploy -e ic', NOT 'dfx deploy --network ic' or '--network ic'", + "Mentions cycles are needed", + "Mentions canister IDs are stored in .icp/data/ and should be committed to git", + "Does NOT use --network ic flag for deployment" + ] + }, + { + "name": "Migrate from dfx", + "prompt": "I have an older IC project that still uses dfx and dfx.json. It has a Motoko backend and a frontend. I want to switch to the new CLI. I also have canisters running on mainnet already.", + "expected_behaviors": [ + "Creates icp.yaml with recipe-based canister configuration", + "Motoko canister uses @dfinity/motoko recipe with a version pin", + "Asset canister uses @dfinity/asset-canister recipe with a version pin", + "Explains identity migration (export from dfx, import into icp)", + "Explains canister ID migration via .icp/data/mappings/ic.ids.json", + "Uses correct icp identity commands ('icp identity default' not 'icp identity use')" + ] + } + ], + + "trigger_evals": { + "description": "Queries to test whether the skill activates correctly. 
'should_trigger' queries should cause the skill to load; 'should_not_trigger' queries should NOT activate this skill.", + "should_trigger": [ + "Set up a new Internet Computer project with Rust", + "How do I deploy my canister to the local network?", + "What's the icp.yaml config for a Motoko canister?", + "I'm getting an error with dfx deploy, can you help?", + "How do I start the local replica?", + "Migrate my dfx.json project to the new CLI", + "How do I create a new identity for mainnet deployment?", + "What recipes are available for icp-cli?", + "My icp deploy is failing with a build error", + "How do I check my canister status on mainnet?" + ], + "should_not_trigger": [ + "Add access control to my Motoko canister", + "How does stable memory work in Rust canisters?", + "Implement ICRC-1 token transfer in my canister", + "Write a unit test for my Motoko actor", + "Set up inter-canister calls between two canisters", + "How do I use certified variables?", + "Explain the IC consensus mechanism", + "Add Internet Identity login to my frontend", + "How do I handle canister upgrades safely?", + "What's the best way to store large data on-chain?" + ] + } +} From a035f91363f19a353d8923be52c361cb360d46e9 Mon Sep 17 00:00:00 2001 From: Marco Walz Date: Wed, 4 Mar 2026 18:53:27 +0100 Subject: [PATCH 2/3] feat: add trigger eval support to the eval runner Presents all skill descriptions as a catalog to a judge, then checks whether each query correctly selects (or avoids) the target skill. Batches all queries into a single judge call for efficiency. Usage: node scripts/run-evals.js icp-cli --triggers-only --- scripts/run-evals.js | 250 ++++++++++++++++++++++++++++++++----------- 1 file changed, 185 insertions(+), 65 deletions(-) diff --git a/scripts/run-evals.js b/scripts/run-evals.js index a5120f6..cb44c33 100644 --- a/scripts/run-evals.js +++ b/scripts/run-evals.js @@ -3,17 +3,20 @@ /** * Skill evaluation runner. 
* - * Runs output_evals from a skill's evals.json by sending the prompt to the - * `claude` CLI — once WITH the skill as context, once WITHOUT — then asks a - * judge model to score each expected behavior as pass/fail. + * Runs output_evals and trigger_evals from a skill's evals.json. + * - Output evals: sends prompts to `claude` CLI with/without the skill, + * then uses a judge to score expected behaviors as pass/fail. + * - Trigger evals: presents all skill descriptions to a judge and checks + * whether each query would correctly trigger (or not trigger) the skill. * * Usage: - * node scripts/run-evals.js [--eval ] [--no-baseline] + * node scripts/run-evals.js [--eval ] [--no-baseline] [--triggers-only] * * Examples: * node scripts/run-evals.js icp-cli * node scripts/run-evals.js icp-cli --eval "Deploy to mainnet" - * node scripts/run-evals.js icp-cli --no-baseline # skip without-skill run + * node scripts/run-evals.js icp-cli --no-baseline + * node scripts/run-evals.js icp-cli --triggers-only * * Requirements: * - `claude` CLI installed and authenticated @@ -22,6 +25,7 @@ import { readFileSync, writeFileSync, mkdirSync } from "fs"; import { execSync } from "child_process"; import { join } from "path"; +import { readAllSkills } from "./lib/parse-skill.js"; const ROOT = new URL("..", import.meta.url).pathname.replace(/\/$/, ""); @@ -31,13 +35,14 @@ const ROOT = new URL("..", import.meta.url).pathname.replace(/\/$/, ""); const args = process.argv.slice(2); const skillName = args.find((a) => !a.startsWith("--")); if (!skillName) { - console.error("Usage: node scripts/run-evals.js [--eval ] [--no-baseline]"); + console.error("Usage: node scripts/run-evals.js [--eval ] [--no-baseline] [--triggers-only]"); process.exit(1); } const evalFilterIdx = args.indexOf("--eval"); const evalFilter = evalFilterIdx !== -1 ? 
args[evalFilterIdx + 1] : null; const skipBaseline = args.includes("--no-baseline"); +const triggersOnly = args.includes("--triggers-only"); // --------------------------------------------------------------------------- // Load skill + evals @@ -46,10 +51,10 @@ const skillDir = join(ROOT, "skills", skillName); const skillContent = readFileSync(join(skillDir, "SKILL.md"), "utf-8"); const evals = JSON.parse(readFileSync(join(skillDir, "evals.json"), "utf-8")); -let cases = evals.output_evals; +let outputCases = evals.output_evals || []; if (evalFilter) { - cases = cases.filter((c) => c.name.toLowerCase().includes(evalFilter.toLowerCase())); - if (cases.length === 0) { + outputCases = outputCases.filter((c) => c.name.toLowerCase().includes(evalFilter.toLowerCase())); + if (outputCases.length === 0 && !triggersOnly) { console.error(`No eval case matching "${evalFilter}"`); process.exit(1); } @@ -127,82 +132,197 @@ ${behaviors}`; } } +/** Build a skill catalog string from all skills in the repo. */ +function buildSkillCatalog() { + const skills = readAllSkills(); + return skills + .map((s) => `- **${s.meta.name}**: ${s.meta.description}`) + .join("\n"); +} + +/** Run trigger evals — check if queries would correctly select the skill. */ +function runTriggerEvals(triggerEvals, targetSkill) { + const catalog = buildSkillCatalog(); + const allQueries = [ + ...(triggerEvals.should_trigger || []).map((q) => ({ query: q, expected: true })), + ...(triggerEvals.should_not_trigger || []).map((q) => ({ query: q, expected: false })), + ]; + + if (allQueries.length === 0) return null; + + // Batch all queries into a single judge call for efficiency + const queryList = allQueries + .map((q, i) => `${i + 1}. "${q.query}"`) + .join("\n"); + + const triggerPrompt = `You are evaluating skill triggering for an agent skill catalog. Given a user query, determine which skill (if any) from the catalog below would be the best match. 
+ + +${catalog} + + +For each query below, respond with the skill name that best matches, or "none" if no skill is a good fit. Return ONLY a JSON array of objects with "query" (string), "selected_skill" (string or "none"), and "reason" (one sentence). + +Queries: +${queryList}`; + + console.log(" Running trigger evaluation..."); + const raw = runClaude(triggerPrompt, null); + + const jsonMatch = raw.match(/\[[\s\S]*\]/); + if (!jsonMatch) { + console.error(` [triggers] Judge returned non-JSON:\n${raw}\n`); + return null; + } + + let selections; + try { + selections = JSON.parse(jsonMatch[0]); + } catch { + console.error(` [triggers] Failed to parse judge JSON:\n${jsonMatch[0]}\n`); + return null; + } + + // Score each query + const results = allQueries.map((q, i) => { + const selection = selections[i]; + if (!selection) return { ...q, pass: false, selected: "error", reason: "No judge response" }; + + const selected = selection.selected_skill?.toLowerCase() || "none"; + const isTarget = selected === targetSkill.toLowerCase(); + + const pass = q.expected ? 
isTarget : !isTarget; + return { + ...q, + pass, + selected: selection.selected_skill || "none", + reason: selection.reason || "", + }; + }); + + return results; +} + // --------------------------------------------------------------------------- -// Run +// Run output evals // --------------------------------------------------------------------------- -console.log(`\nEvaluating skill: ${skillName}`); -console.log(`Cases: ${cases.map((c) => c.name).join(", ")}\n`); +const allResults = { output_evals: [], trigger_evals: null }; -const results = []; +if (!triggersOnly && outputCases.length > 0) { + console.log(`\nEvaluating skill: ${skillName}`); + console.log(`Output cases: ${outputCases.map((c) => c.name).join(", ")}\n`); -for (const evalCase of cases) { - console.log(`━━━ ${evalCase.name} ━━━\n`); + for (const evalCase of outputCases) { + console.log(`━━━ ${evalCase.name} ━━━\n`); - // Run WITH skill - console.log(" Running WITH skill..."); - const withOutput = runClaude(evalCase.prompt, skillContent); + // Run WITH skill + console.log(" Running WITH skill..."); + const withOutput = runClaude(evalCase.prompt, skillContent); - // Run WITHOUT skill (baseline) - let withoutOutput = null; - if (!skipBaseline) { - console.log(" Running WITHOUT skill..."); - withoutOutput = runClaude(evalCase.prompt, null); - } + // Run WITHOUT skill (baseline) + let withoutOutput = null; + if (!skipBaseline) { + console.log(" Running WITHOUT skill..."); + withoutOutput = runClaude(evalCase.prompt, null); + } - // Judge - console.log(" Judging WITH skill..."); - const withJudgment = judge(evalCase, withOutput, "with-skill"); + // Judge + console.log(" Judging WITH skill..."); + const withJudgment = judge(evalCase, withOutput, "with-skill"); - let withoutJudgment = null; - if (withoutOutput) { - console.log(" Judging WITHOUT skill..."); - withoutJudgment = judge(evalCase, withoutOutput, "without-skill"); - } + let withoutJudgment = null; + if (withoutOutput) { + console.log(" Judging 
WITHOUT skill..."); + withoutJudgment = judge(evalCase, withoutOutput, "without-skill"); + } - // Print results - if (withJudgment) { - const passed = withJudgment.filter((j) => j.pass).length; - const total = withJudgment.length; - console.log(`\n WITH skill: ${passed}/${total} passed`); - for (const j of withJudgment) { - console.log(` ${j.pass ? "✅" : "❌"} ${j.behavior}`); - if (!j.pass) console.log(` → ${j.reason}`); + // Print results + if (withJudgment) { + const passed = withJudgment.filter((j) => j.pass).length; + const total = withJudgment.length; + console.log(`\n WITH skill: ${passed}/${total} passed`); + for (const j of withJudgment) { + console.log(` ${j.pass ? "✅" : "❌"} ${j.behavior}`); + if (!j.pass) console.log(` → ${j.reason}`); + } } - } - if (withoutJudgment) { - const passed = withoutJudgment.filter((j) => j.pass).length; - const total = withoutJudgment.length; - console.log(`\n WITHOUT skill: ${passed}/${total} passed`); - for (const j of withoutJudgment) { - console.log(` ${j.pass ? "✅" : "❌"} ${j.behavior}`); - if (!j.pass) console.log(` → ${j.reason}`); + if (withoutJudgment) { + const passed = withoutJudgment.filter((j) => j.pass).length; + const total = withoutJudgment.length; + console.log(`\n WITHOUT skill: ${passed}/${total} passed`); + for (const j of withoutJudgment) { + console.log(` ${j.pass ? "✅" : "❌"} ${j.behavior}`); + if (!j.pass) console.log(` → ${j.reason}`); + } } + + allResults.output_evals.push({ + name: evalCase.name, + with_skill: { output: withOutput, judgment: withJudgment }, + without_skill: withoutOutput + ? { output: withoutOutput, judgment: withoutJudgment } + : null, + }); + + console.log(""); } +} - results.push({ - name: evalCase.name, - with_skill: { output: withOutput, judgment: withJudgment }, - without_skill: withoutOutput - ? 
{ output: withoutOutput, judgment: withoutJudgment } - : null, - }); +// --------------------------------------------------------------------------- +// Run trigger evals +// --------------------------------------------------------------------------- +if (evals.trigger_evals && !evalFilter) { + console.log(`━━━ Trigger Evals ━━━\n`); - console.log(""); + const triggerResults = runTriggerEvals(evals.trigger_evals, skillName); + allResults.trigger_evals = triggerResults; + + if (triggerResults) { + const shouldTrigger = triggerResults.filter((r) => r.expected); + const shouldNot = triggerResults.filter((r) => !r.expected); + + const triggerPassed = shouldTrigger.filter((r) => r.pass).length; + const notTriggerPassed = shouldNot.filter((r) => r.pass).length; + + console.log(`\n Should trigger: ${triggerPassed}/${shouldTrigger.length} correct`); + for (const r of shouldTrigger) { + console.log(` ${r.pass ? "✅" : "❌"} "${r.query}"`); + if (!r.pass) console.log(` → selected "${r.selected}" instead — ${r.reason}`); + } + + console.log(`\n Should NOT trigger: ${notTriggerPassed}/${shouldNot.length} correct`); + for (const r of shouldNot) { + console.log(` ${r.pass ? "✅" : "❌"} "${r.query}"`); + if (!r.pass) console.log(` → incorrectly selected "${r.selected}" — ${r.reason}`); + } + + console.log(""); + } } // --------------------------------------------------------------------------- // Summary + save // --------------------------------------------------------------------------- console.log("━━━ Summary ━━━\n"); -for (const r of results) { - const withScore = r.with_skill.judgment - ? `${r.with_skill.judgment.filter((j) => j.pass).length}/${r.with_skill.judgment.length}` - : "error"; - const withoutScore = r.without_skill?.judgment - ? 
`${r.without_skill.judgment.filter((j) => j.pass).length}/${r.without_skill.judgment.length}` - : "skipped"; - console.log(` ${r.name}: WITH ${withScore} | WITHOUT ${withoutScore}`); + +if (allResults.output_evals.length > 0) { + console.log(" Output evals:"); + for (const r of allResults.output_evals) { + const withScore = r.with_skill.judgment + ? `${r.with_skill.judgment.filter((j) => j.pass).length}/${r.with_skill.judgment.length}` + : "error"; + const withoutScore = r.without_skill?.judgment + ? `${r.without_skill.judgment.filter((j) => j.pass).length}/${r.without_skill.judgment.length}` + : "skipped"; + console.log(` ${r.name}: WITH ${withScore} | WITHOUT ${withoutScore}`); + } +} + +if (allResults.trigger_evals) { + const shouldTrigger = allResults.trigger_evals.filter((r) => r.expected); + const shouldNot = allResults.trigger_evals.filter((r) => !r.expected); + console.log(` Trigger evals: should-trigger ${shouldTrigger.filter((r) => r.pass).length}/${shouldTrigger.length} | should-not-trigger ${shouldNot.filter((r) => r.pass).length}/${shouldNot.length}`); } // Save full results @@ -210,7 +330,7 @@ const outDir = join(ROOT, "skills", skillName, "eval-results"); mkdirSync(outDir, { recursive: true }); const timestamp = new Date().toISOString().replace(/[:.]/g, "-"); const outFile = join(outDir, `run-${timestamp}.json`); -writeFileSync(outFile, JSON.stringify(results, null, 2)); +writeFileSync(outFile, JSON.stringify(allResults, null, 2)); console.log(`\nFull results saved to: ${outFile}\n`); // Cleanup From 96a65bf1385753ba3541cb012688ae2a0c17f0ce Mon Sep 17 00:00:00 2001 From: Marco Walz Date: Thu, 5 Mar 2026 18:45:49 +0100 Subject: [PATCH 3/3] refactor: rename eval script, move evals out of skill directories MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename scripts/run-evals.js → scripts/evaluate-skills.js - Move skills/icp-cli/evals.json → evaluations/icp-cli.json - Results now save to evaluations/results/ 
(gitignored) - Validator checks evaluations/.json instead of skill dirs - Add --triggers-only flag for trigger-only evaluation runs - Updated all references in CONTRIBUTING.md, CLAUDE.md, README.md Keeps skill directories clean (one skill = one SKILL.md). --- .claude/CLAUDE.md | 11 +++++----- .gitignore | 2 +- CONTRIBUTING.md | 13 ++++++------ README.md | 2 +- .../evals.json => evaluations/icp-cli.json | 0 scripts/{run-evals.js => evaluate-skills.js} | 21 ++++++++++--------- scripts/validate-skills.js | 5 +++-- 7 files changed, 29 insertions(+), 25 deletions(-) rename skills/icp-cli/evals.json => evaluations/icp-cli.json (100%) rename scripts/{run-evals.js => evaluate-skills.js} (93%) diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index c2840bc..c30ed6e 100644 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -41,13 +41,14 @@ Validate runs in CI and blocks deployment on errors. ## Evaluations -Each skill should have an `evals.json` file with test cases. Run evaluations with: +Each skill should have an evaluation file at `evaluations/.json`. Run evaluations with: ```bash -node scripts/run-evals.js # All evals -node scripts/run-evals.js --eval "X" # Single eval by name -node scripts/run-evals.js --no-baseline # Skip without-skill baseline +node scripts/evaluate-skills.js # All evals +node scripts/evaluate-skills.js --eval "X" # Single eval by name +node scripts/evaluate-skills.js --no-baseline # Skip without-skill baseline +node scripts/evaluate-skills.js --triggers-only # Trigger evals only ``` -Results are saved to `skills//eval-results/` (gitignored). See `skills/icp-cli/evals.json` for the format. +Results are saved to `evaluations/results/` (gitignored). See `evaluations/icp-cli.json` for the format. 
## Writing Guidelines diff --git a/.gitignore b/.gitignore index 7ac2ab2..a02b91e 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,4 @@ public/llms-full.txt .astro lighthouse-* .eval-tmp -skills/*/eval-results/ +evaluations/results/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7de05ec..0a6ea6a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -123,22 +123,23 @@ This runs automatically in CI and blocks deployment on errors. ### 4. Add evaluation cases -Create `skills/<skill-name>/evals.json` with test cases that verify the skill works. The eval file has two sections: +Create `evaluations/<skill-name>.json` with test cases that verify the skill works. The eval file has two sections: - **`output_evals`** — realistic prompts with expected behaviors a judge can check - **`trigger_evals`** — queries that should/shouldn't activate the skill -See `skills/icp-cli/evals.json` for a working example. Write prompts the way a developer would actually ask — vague and incomplete, not over-specified test questions. +See `evaluations/icp-cli.json` for a working example. Write prompts the way a developer would actually ask — vague and incomplete, not over-specified test questions. **Running evaluations** (optional, requires `claude` CLI): ```bash -node scripts/run-evals.js <skill-name> # All evals, with + without skill -node scripts/run-evals.js <skill-name> --eval "name" # Single eval -node scripts/run-evals.js <skill-name> --no-baseline # Skip without-skill run +node scripts/evaluate-skills.js <skill-name> # All evals, with + without skill +node scripts/evaluate-skills.js <skill-name> --eval "name" # Single eval +node scripts/evaluate-skills.js <skill-name> --no-baseline # Skip without-skill run +node scripts/evaluate-skills.js <skill-name> --triggers-only # Trigger evals only ``` -This sends each prompt to Claude with and without the skill, then has a judge score the output.
Results are saved to `evaluations/results/` (gitignored). Including a summary of eval results in your PR description is recommended but not required — running evals needs `claude` CLI access and costs API credits. diff --git a/README.md b/README.md index 611fb75..a307323 100644 --- a/README.md +++ b/README.md @@ -86,7 +86,7 @@ See [CONTRIBUTING.md](CONTRIBUTING.md) for how to add or update skills. - **Hosting**: GitHub Pages via Actions - **Skills**: Plain markdown files in `skills/*/SKILL.md` - **Validation**: Structural linter for frontmatter and code blocks (`npm run validate`) -- **Evaluation**: Per-skill eval cases with LLM-as-judge scoring (`node scripts/run-evals.js <skill>`) +- **Evaluation**: Per-skill eval cases with LLM-as-judge scoring (`node scripts/evaluate-skills.js <skill>`) - **Schema**: JSON Schema for frontmatter at `skills/skill.schema.json` - **SEO**: Per-skill meta tags, JSON-LD (TechArticle), sitemap, canonical URLs - **AI Agent Discovery**: `llms.txt`, `llms-full.txt`, `.well-known/agent.json`, per-skill `.md` endpoints diff --git a/skills/icp-cli/evals.json b/evaluations/icp-cli.json similarity index 100% rename from skills/icp-cli/evals.json rename to evaluations/icp-cli.json diff --git a/scripts/run-evals.js b/scripts/evaluate-skills.js similarity index 93% rename from scripts/run-evals.js rename to scripts/evaluate-skills.js index cb44c33..7603200 100644 --- a/scripts/run-evals.js +++ b/scripts/evaluate-skills.js @@ -3,20 +3,20 @@ /** * Skill evaluation runner. * - * Runs output_evals and trigger_evals from a skill's evals.json. + * Runs output_evals and trigger_evals from evaluations/<skill>.json. * - Output evals: sends prompts to `claude` CLI with/without the skill, * then uses a judge to score expected behaviors as pass/fail. * - Trigger evals: presents all skill descriptions to a judge and checks * whether each query would correctly trigger (or not trigger) the skill. 
* * Usage: - * node scripts/run-evals.js <skill> [--eval <name>] [--no-baseline] [--triggers-only] + * node scripts/evaluate-skills.js <skill> [--eval <name>] [--no-baseline] [--triggers-only] * * Examples: - * node scripts/run-evals.js icp-cli - * node scripts/run-evals.js icp-cli --eval "Deploy to mainnet" - * node scripts/run-evals.js icp-cli --no-baseline - * node scripts/run-evals.js icp-cli --triggers-only + * node scripts/evaluate-skills.js icp-cli + * node scripts/evaluate-skills.js icp-cli --eval "Deploy to mainnet" + * node scripts/evaluate-skills.js icp-cli --no-baseline + * node scripts/evaluate-skills.js icp-cli --triggers-only * * Requirements: * - `claude` CLI installed and authenticated @@ -35,7 +35,7 @@ const ROOT = new URL("..", import.meta.url).pathname.replace(/\/$/, ""); const args = process.argv.slice(2); const skillName = args.find((a) => !a.startsWith("--")); if (!skillName) { - console.error("Usage: node scripts/run-evals.js <skill> [--eval <name>] [--no-baseline] [--triggers-only]"); + console.error("Usage: node scripts/evaluate-skills.js <skill> [--eval <name>] [--no-baseline] [--triggers-only]"); process.exit(1); } @@ -49,7 +49,8 @@ const triggersOnly = args.includes("--triggers-only"); // --------------------------------------------------------------------------- const skillDir = join(ROOT, "skills", skillName); const skillContent = readFileSync(join(skillDir, "SKILL.md"), "utf-8"); -const evals = JSON.parse(readFileSync(join(skillDir, "evals.json"), "utf-8")); +const evalsFile = join(ROOT, "evaluations", `${skillName}.json`); +const evals = JSON.parse(readFileSync(evalsFile, "utf-8")); let outputCases = evals.output_evals || []; if (evalFilter) { @@ -326,10 +327,10 @@ if (allResults.trigger_evals) { } // Save full results -const outDir = join(ROOT, "skills", skillName, "eval-results"); +const outDir = join(ROOT, "evaluations", "results"); mkdirSync(outDir, { recursive: true }); const timestamp = new Date().toISOString().replace(/[:.]/g, "-"); -const outFile = join(outDir, 
`run-${timestamp}.json`); +const outFile = join(outDir, `${skillName}-${timestamp}.json`); writeFileSync(outFile, JSON.stringify(allResults, null, 2)); console.log(`\nFull results saved to: ${outFile}\n`); diff --git a/scripts/validate-skills.js b/scripts/validate-skills.js index 9cee0d1..e4759a0 100644 --- a/scripts/validate-skills.js +++ b/scripts/validate-skills.js @@ -132,8 +132,9 @@ for (const skill of skills) { } // --- Evals validation --- - if (!existsSync(join(SKILLS_DIR, dir, "evals.json"))) { - warn(label, `missing evals.json — see CONTRIBUTING.md for evaluation guidance`); + const evalsDir = join(SKILLS_DIR, "..", "evaluations"); + if (!existsSync(join(evalsDir, `${dir}.json`))) { + warn(label, `missing evaluations/${dir}.json — see CONTRIBUTING.md for evaluation guidance`); } }