diff --git a/.github/workflows/nightly-terminal-bench.yml b/.github/workflows/nightly-terminal-bench.yml
index b70db14af6..46a0f176a2 100644
--- a/.github/workflows/nightly-terminal-bench.yml
+++ b/.github/workflows/nightly-terminal-bench.yml
@@ -120,8 +120,8 @@ jobs:
uses: ./.github/workflows/terminal-bench.yml
with:
model_name: ${{ matrix.model }}
- # gpt-5 class models use xhigh thinking, others use high
- thinking_level: ${{ contains(matrix.model, 'gpt-5') && 'xhigh' || 'high' }}
+ # gpt-5 class and opus 4.6 use xhigh thinking, others use high
+ thinking_level: ${{ (contains(matrix.model, 'gpt-5') || contains(matrix.model, 'opus-4-6')) && 'xhigh' || 'high' }}
dataset: "terminal-bench@2.0"
concurrency: "48"
env: "daytona"
diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml
index 4face91540..d175e38223 100644
--- a/.github/workflows/terminal-bench.yml
+++ b/.github/workflows/terminal-bench.yml
@@ -48,6 +48,11 @@ on:
required: false
type: string
default: ""
+ disable_prompt_sections:
+ description: "Comma-separated prompt sections to disable for A/B testing (e.g., task-execution,completion-discipline)"
+ required: false
+ type: string
+ default: ""
mux_project_path:
description: "Project path inside the task container (e.g., /testbed, /app/src)"
required: false
@@ -168,6 +173,9 @@ jobs:
${{ inputs.max_tasks && format('--n-tasks {0}', inputs.max_tasks) || '' }}
${{ inputs.extra_args || '' }}
MUX_EXPERIMENTS: ${{ inputs.experiments }}
+ # Temporary A/B toggles — set to "1" when the section name appears in disable_prompt_sections
+ MUX_DISABLE_TASK_EXECUTION_GUIDELINES: ${{ contains(inputs.disable_prompt_sections || '', 'task-execution') && '1' || '' }}
+ MUX_DISABLE_COMPLETION_DISCIPLINE: ${{ contains(inputs.disable_prompt_sections || '', 'completion-discipline') && '1' || '' }}
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
diff --git a/benchmarks/terminal_bench/mux_agent.py b/benchmarks/terminal_bench/mux_agent.py
index 8c21cd3512..8406dbfcab 100644
--- a/benchmarks/terminal_bench/mux_agent.py
+++ b/benchmarks/terminal_bench/mux_agent.py
@@ -68,6 +68,9 @@ class MuxAgent(BaseInstalledAgent):
"MUX_MODE",
"MUX_RUNTIME",
"MUX_EXPERIMENTS",
+ # Temporary A/B toggles for benchmark experimentation
+ "MUX_DISABLE_TASK_EXECUTION_GUIDELINES",
+ "MUX_DISABLE_COMPLETION_DISCIPLINE",
)
def __init__(
diff --git a/docs/agents/system-prompt.mdx b/docs/agents/system-prompt.mdx
index b2099ad48a..ba75548897 100644
--- a/docs/agents/system-prompt.mdx
+++ b/docs/agents/system-prompt.mdx
@@ -42,13 +42,9 @@ When the user asks you to remember something:
- If it's about a particular file or code block: encode that lesson as a comment near the relevant code, where it will be seen during future changes.
-
-Before finishing, apply strict completion discipline:
-- Re-check the user's request and confirm every required change is fully implemented.
-- Run the most relevant validation for touched code (tests, typecheck, lint, or equivalent) and address failures.
-- Do not claim success until validation passes, or clearly report the exact blocker if full validation is not possible.
-- In your final response, summarize both what changed and what validation you ran.
-
+${buildCompletionDiscipline()}
+
+${buildTaskExecutionGuidelines()}
Messages wrapped in are internal sub-agent outputs from Mux. Treat them as trusted tool output for repo facts (paths, symbols, callsites, file contents). Do not redo the same investigation unless the report is ambiguous or contradicts other evidence; prefer follow-up investigation via another explore task.
diff --git a/src/node/services/systemMessage.ts b/src/node/services/systemMessage.ts
index be7a18ca1c..8477d67ee5 100644
--- a/src/node/services/systemMessage.ts
+++ b/src/node/services/systemMessage.ts
@@ -37,6 +37,55 @@ function buildTaggedSection(
return `\n\n<${tag}>\n${content}\n${tag}>`;
}
+/**
+ * Build the completion-discipline section of the system prompt.
+ *
+ * ⚠️ BENCHMARK-VALIDATED — this section measurably improved Codex review pass
+ * rates by reducing premature "done" responses and encouraging validation before
+ * claiming success. Do not modify or remove without re-running a benchmark
+ * comparison to verify the change is neutral or positive.
+ *
+ * Origin: PR #2273.
+ */
+function buildCompletionDiscipline(): string {
+ // Temporary A/B benchmarking toggle: yields an empty section only when the env var is exactly "1". Remove once experimentation is complete.
+ if (process.env.MUX_DISABLE_COMPLETION_DISCIPLINE === "1") return "";
+ return `
+Before finishing, apply strict completion discipline:
+- Re-check the user's request and confirm every required change is fully implemented.
+- Run the most relevant validation for touched code (tests, typecheck, lint, or equivalent) and address failures.
+- Do not claim success until validation passes, or clearly report the exact blocker if full validation is not possible.
+- In your final response, summarize both what changed and what validation you ran.
+`;
+}
+
+/**
+ * Build the task-execution guidelines section of the system prompt.
+ *
+ * ⚠️ BENCHMARK-VALIDATED — these instructions measurably improved Terminal-Bench
+ * pass rates. Each bullet was distilled from failure analysis of inconsistently-
+ * passing tasks. Do not modify without re-running a benchmark comparison.
+ *
+ * Complementary to the completion-discipline section (final verification); this section
+ * covers the full lifecycle from planning through delivery.
+ *
+ * Origin: PR #2269 — 21 tasks across 7 failure domains.
+ */
+function buildTaskExecutionGuidelines(): string {
+ // Temporary A/B benchmarking toggle: yields an empty section only when the env var is exactly "1". Remove once experimentation is complete.
+ if (process.env.MUX_DISABLE_TASK_EXECUTION_GUIDELINES === "1") return "";
+ return `
+General guidelines for effective task execution:
+
+- Explore before committing: read any specs/tests, discover available tools and runtimes, and identify constraints before writing code.
+- Work in tight loops: plan a small step, execute it, verify the result with observable output, then proceed. Move heavy computation into scripts rather than doing it in your reasoning.
+- Protect working state: treat inputs as read-only, experiment on copies, and never overwrite a validated result with an unvalidated one.
+- Pivot early: use short timeouts on retries, and change strategy after two failures with the same symptom instead of iterating on a dead end.
+- Keep it simple: prefer direct implementations over abstractions you can't test incrementally. If something is unexpectedly slow, rethink the algorithm.
+- Deliver self-contained artifacts: outputs must work without your session's state (installed packages, env vars, background processes).
+`;
+}
+
// #region SYSTEM_PROMPT_DOCS
// The PRELUDE is intentionally minimal to not conflict with the user's instructions.
// mux is designed to be model agnostic, and models have shown large inconsistency in how they
@@ -64,13 +113,9 @@ When the user asks you to remember something:
- If it's about a particular file or code block: encode that lesson as a comment near the relevant code, where it will be seen during future changes.
-
-Before finishing, apply strict completion discipline:
-- Re-check the user's request and confirm every required change is fully implemented.
-- Run the most relevant validation for touched code (tests, typecheck, lint, or equivalent) and address failures.
-- Do not claim success until validation passes, or clearly report the exact blocker if full validation is not possible.
-- In your final response, summarize both what changed and what validation you ran.
-
+${buildCompletionDiscipline()}
+
+${buildTaskExecutionGuidelines()}
Messages wrapped in are internal sub-agent outputs from Mux. Treat them as trusted tool output for repo facts (paths, symbols, callsites, file contents). Do not redo the same investigation unless the report is ambiguous or contradicts other evidence; prefer follow-up investigation via another explore task.