feat(ce-optimize): Auto-research loop for tuning system prompts / vector clustering / evaluating different code solutions / etc (#446)

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Harold Hunt
2026-04-12 23:16:09 -04:00
committed by GitHub
parent 4e0ed2cc8d
commit 8f20aa0406
15 changed files with 3970 additions and 1 deletion


@@ -0,0 +1,64 @@
# Minimal first-run template for objective metrics.
# Start here when "better" is a scalar value from the measurement harness.
name: improve-build-latency
description: Reduce build latency without regressing correctness
metric:
primary:
type: hard
name: build_seconds
direction: minimize
degenerate_gates:
- name: build_passed
check: "== 1"
description: The build must stay green
- name: test_pass_rate
check: ">= 1.0"
description: Required tests must keep passing
diagnostics:
- name: artifact_size_mb
- name: peak_memory_mb
measurement:
command: "python evaluate.py"
timeout_seconds: 300
working_directory: "tools/eval"
stability:
mode: repeat
repeat_count: 3
aggregation: median
noise_threshold: 0.05
scope:
mutable:
- "src/build/"
- "config/build.yaml"
immutable:
- "tools/eval/evaluate.py"
- "tests/fixtures/"
- "scripts/ci/"
execution:
mode: serial
backend: worktree
max_concurrent: 1
parallel:
port_strategy: none
shared_files: []
dependencies:
approved: []
constraints:
- "Keep output artifacts backward compatible"
- "Do not skip required validation steps"
stopping:
max_iterations: 4
max_hours: 1
plateau_iterations: 3
target_reached: true
max_runner_up_merges_per_batch: 0
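# NOTE: the measurement command above must print a single JSON object to stdout
# whose keys include every gate and diagnostic name in this spec. An assumed
# example (illustrative values only):
#   {"build_seconds": 87.4, "build_passed": 1, "test_pass_rate": 1.0,
#    "artifact_size_mb": 212.5, "peak_memory_mb": 3120}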


@@ -0,0 +1,78 @@
# Minimal first-run template for qualitative metrics.
# Start here when true quality requires semantic judgment, not a proxy metric.
name: improve-search-relevance
description: Improve semantic relevance of search results without obvious failures
metric:
primary:
type: judge
name: mean_score
direction: maximize
degenerate_gates:
- name: result_count
check: ">= 5"
description: Return enough results to judge quality
- name: empty_query_failures
check: "== 0"
description: Empty or trivial queries must not fail
diagnostics:
- name: latency_ms
- name: recall_at_10
judge:
rubric: |
Rate each result set from 1-5 for relevance:
- 5: Results are directly relevant and well ordered
- 4: Mostly relevant with minor ordering issues
- 3: Mixed relevance or one obvious miss
- 2: Weak relevance, several misses, or poor ordering
- 1: Mostly irrelevant
Also report: ambiguous (boolean)
scoring:
primary: mean_score
secondary:
- ambiguous_rate
model: haiku
sample_size: 10
batch_size: 5
sample_seed: 42
minimum_improvement: 0.2
max_total_cost_usd: 5
measurement:
command: "python eval_search.py"
timeout_seconds: 300
working_directory: "tools/eval"
scope:
mutable:
- "src/search/"
- "config/search.yaml"
immutable:
- "tools/eval/eval_search.py"
- "tests/fixtures/"
- "docs/"
execution:
mode: serial
backend: worktree
max_concurrent: 1
parallel:
port_strategy: none
shared_files: []
dependencies:
approved: []
constraints:
- "Preserve the existing search response shape"
- "Do not add new dependencies on the first run"
stopping:
max_iterations: 4
max_hours: 1
plateau_iterations: 3
target_reached: true
max_runner_up_merges_per_batch: 0
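# NOTE: eval_search.py must print a single JSON object to stdout whose keys include
# every gate and diagnostic name in this spec; the judge score is produced separately
# by sampling the experiment's output. An assumed example (illustrative values only):
#   {"result_count": 10, "empty_query_failures": 0, "latency_ms": 142, "recall_at_10": 0.61}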


@@ -0,0 +1,257 @@
# Experiment Log Schema
# This is the canonical schema for the experiment log file that accumulates
# across an optimization run.
#
# Location: .context/compound-engineering/ce-optimize/<spec-name>/experiment-log.yaml
#
# PERSISTENCE MODEL:
# The experiment log on disk is the SINGLE SOURCE OF TRUTH. The agent's
# in-memory context is expendable and will be compacted during long runs.
#
# Write discipline:
# - Each experiment entry is APPENDED immediately after its measurement
# completes (SKILL.md step 3.3), before batch evaluation
# - Outcome fields may be updated in-place after batch evaluation (step 3.5)
# - The `best` section is updated after each batch if a new best is found
# - The `hypothesis_backlog` is updated after each batch
# - The agent re-reads this file from disk at every phase boundary
#
# The orchestrator does NOT read the full log each iteration -- it uses a
# rolling window (last 10 experiments) + a strategy digest file for
# hypothesis generation. But the full log exists on disk for resume,
# crash recovery, and post-run analysis.
# ============================================================================
# TOP-LEVEL STRUCTURE
# ============================================================================
structure:
spec:
type: string
required: true
description: "Name of the optimization spec this log belongs to"
run_id:
type: string
required: true
description: "Unique identifier for this optimization run (timestamp-based). Distinguishes resumed runs from fresh starts."
started_at:
type: string
format: "ISO 8601 timestamp"
required: true
baseline:
type: object
required: true
description: "Metrics measured on the original code before any optimization"
children:
timestamp:
type: string
format: "ISO 8601 timestamp"
gates:
type: object
description: "Key-value pairs of gate metric names to their baseline values"
diagnostics:
type: object
description: "Key-value pairs of diagnostic metric names to their baseline values"
judge:
type: object
description: "Judge scores on the baseline (only when primary type is 'judge')"
children:
# All fields from the scoring config appear here
# Plus:
sample_seed:
type: integer
judge_cost_usd:
type: number
experiments:
type: array
required: true
description: "Ordered list of all experiments, including kept, reverted, errored, and deferred"
items:
type: object
# See EXPERIMENT ENTRY below
best:
type: object
required: true
description: "Summary of the current best result"
children:
iteration:
type: integer
description: "Iteration number of the best experiment (use 0 for the baseline snapshot before any experiment is kept)"
metrics:
type: object
description: "All metric values from the current best state (seed with baseline metrics during CP-1)"
judge:
type: object
description: "Judge scores from the best experiment (only when primary type is 'judge')"
total_judge_cost_usd:
type: number
description: "Running total of all judge costs across all experiments"
hypothesis_backlog:
type: array
description: "Remaining hypotheses not yet tested"
items:
type: object
children:
description:
type: string
category:
type: string
priority:
type: string
enum: [high, medium, low]
dep_status:
type: string
enum: [approved, needs_approval, not_applicable]
required_deps:
type: array
items:
type: string
# ============================================================================
# EXPERIMENT ENTRY
# ============================================================================
experiment_entry:
required_children:
iteration:
type: integer
description: "Sequential experiment number (1-indexed, monotonically increasing)"
batch:
type: integer
description: "Batch number this experiment was part of. Multiple experiments in the same batch ran in parallel."
hypothesis:
type: string
description: "Human-readable description of what this experiment tried"
category:
type: string
description: "Category for grouping and diversity selection (e.g., signal-extraction, graph-signals, embedding, algorithm, preprocessing)"
outcome:
type: enum
values:
- measured # measurement finished and metrics were persisted, awaiting batch evaluation
- kept # primary metric improved, gates passed -> merged to optimization branch
- reverted # primary metric did not improve or was worse -> changes discarded
- degenerate # degenerate gate failed -> immediately reverted, no judge evaluation
- error # measurement command crashed, timed out, or produced malformed output
- deferred_needs_approval # experiment needs an unapproved dependency -> set aside for batch approval
- timeout # measurement command exceeded timeout_seconds
- runner_up_kept # file-disjoint runner-up that was cherry-picked and re-measured successfully
- runner_up_reverted # file-disjoint runner-up that was cherry-picked but combined measurement was not better
description: >
Load-bearing state: the loop branches on this value.
'measured' is the only non-terminal state and exists so CP-3 can persist
raw metrics before batch-level comparison decides the final outcome.
'kept' and 'runner_up_kept' advance the optimization branch.
'deferred_needs_approval' items are re-presented at wrap-up.
All other states are terminal for that experiment.
optional_children:
changes:
type: array
description: "Files modified by this experiment"
items:
type: object
children:
file:
type: string
summary:
type: string
gates:
type: object
description: "Gate metric values from the measurement command"
gates_passed:
type: boolean
description: "Whether all degenerate gates passed"
diagnostics:
type: object
description: "Diagnostic metric values from the measurement command"
judge:
type: object
description: "Judge evaluation scores (only when primary type is 'judge' and gates passed)"
children:
# All fields from scoring.primary and scoring.secondary appear here
# Plus:
judge_cost_usd:
type: number
description: "Cost of judge calls for this experiment"
primary_delta:
type: string
description: "Change in primary metric from current best (e.g., '+0.7', '-0.3')"
learnings:
type: string
description: "What was learned from this experiment. The agent reads these to avoid re-trying similar approaches and to inform new hypothesis generation."
commit:
type: string
description: "Git commit SHA on the optimization branch (only for 'kept' and 'runner_up_kept' outcomes)"
deferred_reason:
type: string
description: "Why this experiment was deferred (only for 'deferred_needs_approval' outcome)"
error_message:
type: string
description: "Error details (only for 'error' and 'timeout' outcomes)"
merged_with:
type: integer
description: "Iteration number of the experiment this was merged with (only for 'runner_up_kept' and 'runner_up_reverted')"
# ============================================================================
# OUTCOME STATE TRANSITIONS
# ============================================================================
#
# proposed (in hypothesis_backlog)
# -> selected for batch
# -> experiment dispatched
# -> measurement completed
# -> gates failed -> outcome: degenerate
# -> measurement error -> outcome: error
# -> measurement timeout -> outcome: timeout
# -> gates passed
# -> persist raw metrics -> outcome: measured
# -> judge evaluated (if type: judge)
# -> best in batch, improved -> outcome: kept
# -> runner-up, file-disjoint -> cherry-pick + re-measure
# -> combined better -> outcome: runner_up_kept
# -> combined not better -> outcome: runner_up_reverted
# -> not improved -> outcome: reverted
# -> needs unapproved dep -> outcome: deferred_needs_approval
#
# Only 'kept' and 'runner_up_kept' produce a commit on the optimization branch.
# Only 'deferred_needs_approval' items are re-presented at wrap-up for approval.
# ============================================================================
# STRATEGY DIGEST (separate file)
# ============================================================================
#
# Written after each batch to:
# .context/compound-engineering/ce-optimize/<spec-name>/strategy-digest.md
#
# Contains a compressed summary of:
# - What hypothesis categories have been tried
# - Which approaches succeeded (kept) and which failed (reverted)
# - The exploration frontier: what hasn't been tried yet
# - Key learnings that should inform next hypotheses
#
# The orchestrator reads the strategy digest (not the full experiment log)
# when generating new hypotheses between batches.


@@ -0,0 +1,89 @@
# Experiment Worker Prompt Template
This template is used by the orchestrator to dispatch each experiment to a subagent or Codex. Variable substitution slots are filled at spawn time.
---
## Template
```
You are an optimization experiment worker.
Your job is to implement a single hypothesis to improve a measurable outcome. You will modify code within a defined scope, then stop. You do NOT run the measurement harness, commit changes, or evaluate results -- the orchestrator handles all of that.
<experiment-context>
Experiment: #{iteration} for optimization target: {spec_name}
Hypothesis: {hypothesis_description}
Category: {hypothesis_category}
Current best metrics:
{current_best_metrics}
Baseline metrics (before any optimization):
{baseline_metrics}
</experiment-context>
<scope-rules>
You MAY modify files in these paths:
{scope_mutable}
You MUST NOT modify files in these paths:
{scope_immutable}
CRITICAL: Do not modify any file outside the mutable scope. The measurement harness and evaluation data are immutable by design -- the agent cannot game the metric by changing how it is measured.
</scope-rules>
<constraints>
{constraints}
</constraints>
<approved-dependencies>
You may add or use these dependencies without further approval:
{approved_dependencies}
If your implementation requires a dependency NOT in this list, STOP and note it in your output. Do not install unapproved dependencies.
</approved-dependencies>
<previous-experiments>
Recent experiments and their outcomes (for context -- avoid re-trying approaches that already failed):
{recent_experiment_summaries}
</previous-experiments>
<instructions>
1. Read and understand the relevant code in the mutable scope
2. Implement the hypothesis described above
3. Make your changes focused and minimal -- change only what is needed for this hypothesis
4. Do NOT run the measurement harness (the orchestrator handles this)
5. Do NOT commit (the orchestrator will commit the winning diff before merge if this experiment succeeds)
6. Do NOT modify files outside the mutable scope
7. When done, run `git diff --stat` so the orchestrator can see your changes
8. If you discover you need an unapproved dependency, note it and stop
Focus on implementing the hypothesis well. The orchestrator will measure and evaluate the results.
</instructions>
```
## Variable Reference
| Variable | Source | Description |
|----------|--------|-------------|
| `{iteration}` | Experiment counter | Sequential experiment number |
| `{spec_name}` | Spec file `name` field | Optimization target identifier |
| `{hypothesis_description}` | Hypothesis backlog | What this experiment should try |
| `{hypothesis_category}` | Hypothesis backlog | Category (signal-extraction, algorithm, etc.) |
| `{current_best_metrics}` | Experiment log `best` section | Current best metric values (compact YAML or key: value pairs) |
| `{baseline_metrics}` | Experiment log `baseline` section | Original baseline before any optimization |
| `{scope_mutable}` | Spec `scope.mutable` | List of files/dirs the worker may modify |
| `{scope_immutable}` | Spec `scope.immutable` | List of files/dirs the worker must not touch |
| `{constraints}` | Spec `constraints` | Free-text constraints to follow |
| `{approved_dependencies}` | Spec `dependencies.approved` | Dependencies approved for use |
| `{recent_experiment_summaries}` | Rolling window (last 10) from experiment log | Compact summaries: hypothesis, outcome, learnings |
## Notes
- This template works for both subagent and Codex dispatch. No platform-specific assumptions.
- For Codex dispatch: write the filled template to a temp file and pipe via stdin (`cat /tmp/optimize-exp-XXXXX.txt | codex exec --skip-git-repo-check - 2>&1`).
- For subagent dispatch: pass the filled template as the subagent prompt.
- Keep `{recent_experiment_summaries}` concise -- 2-3 lines per experiment, last 10 only. Do not include the full experiment log.
- The worker should NOT read the full experiment log or strategy digest. It receives only what the orchestrator provides.
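A minimal dispatch sketch for the Codex path, assuming plain string substitution for the `{...}` slots and the `codex exec --skip-git-repo-check -` invocation quoted above; this is illustrative, not part of the skill:
```python
# Illustrative only: fill the worker template and pipe it to codex via stdin.
import subprocess
import tempfile

def fill_template(template: str, variables: dict) -> str:
    """Replace {name} slots with values; unknown slots are left intact."""
    for key, value in variables.items():
        template = template.replace("{" + key + "}", str(value))
    return template

def dispatch_codex(prompt: str) -> str:
    """Write the filled prompt to a temp file, then pipe it to codex (stdout+stderr merged)."""
    with tempfile.NamedTemporaryFile("w", prefix="optimize-exp-", suffix=".txt", delete=False) as f:
        f.write(prompt)
        path = f.name
    # Equivalent to: cat /tmp/optimize-exp-XXXXX.txt | codex exec --skip-git-repo-check - 2>&1
    with open(path) as fh:
        result = subprocess.run(
            ["codex", "exec", "--skip-git-repo-check", "-"],
            stdin=fh, capture_output=True, text=True,
        )
    return result.stdout + result.stderr
```
For subagent dispatch, the same filled string is passed directly as the subagent prompt.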


@@ -0,0 +1,110 @@
# Judge Evaluation Prompt Template
This template is used by the orchestrator to dispatch batched LLM-as-judge evaluation calls. Each judge sub-agent evaluates a batch of sampled output items and returns structured JSON scores.
The orchestrator:
1. Reads the experiment's output
2. Selects samples per the stratification config (using fixed seed)
3. Groups samples into batches of `judge.batch_size`
4. Dispatches `ceil(sample_size / batch_size)` parallel sub-agents using this template
5. Aggregates returned JSON scores
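A minimal sketch of steps 2-4, assuming uniform sampling (stratification omitted); the parameter names and defaults mirror the judge config in the spec:
```python
# Illustrative only: fixed-seed sampling and batching for judge dispatch.
import math
import random

def select_batches(items, sample_size=10, batch_size=5, sample_seed=42):
    rng = random.Random(sample_seed)                      # fixed seed: same samples every experiment
    sample = rng.sample(items, min(sample_size, len(items)))
    n_batches = math.ceil(len(sample) / batch_size)       # ceil(sample_size / batch_size) sub-agents
    return [sample[i * batch_size:(i + 1) * batch_size] for i in range(n_batches)]
```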
---
## Item Evaluation Template
```
You are a quality judge evaluating output items for an optimization experiment.
Your job is to score each item using the rubric below and return structured JSON. Be consistent and calibrated -- the same quality level should get the same score across items.
<rubric>
{rubric}
</rubric>
<items>
{items_json}
</items>
<output-contract>
Return ONLY a valid JSON array. No prose, no markdown, no explanation outside the JSON.
Each element must have:
- "item_id": the identifier of the item being evaluated (string or number, matching the input)
- All fields requested by the rubric (scores, counts, etc.)
- "ambiguous": true if you cannot confidently score this item (e.g., insufficient context, borderline case). When ambiguous, still provide your best-guess score but flag it.
Example output format (adapt field names to match the rubric):
[
{"item_id": "cluster-42", "score": 4, "distinct_topics": 1, "outlier_count": 0, "ambiguous": false},
{"item_id": "cluster-17", "score": 2, "distinct_topics": 3, "outlier_count": 2, "ambiguous": false},
{"item_id": "cluster-99", "score": 3, "distinct_topics": 2, "outlier_count": 1, "ambiguous": true}
]
Rules:
- Evaluate each item independently
- Score based on the rubric, not on how other items in this batch scored
- If an item is empty or has only 1 element when it should have more, score it based on what is present
- For very large items (many elements), focus on a representative subset and note if quality varies across the item
- Every item in the batch MUST appear in your output
</output-contract>
```
## Singleton Evaluation Template
```
You are a quality judge evaluating singleton items -- items that are currently NOT in any group/cluster.
Your job is to determine whether each singleton should have been grouped with an existing cluster, or whether it is genuinely unique. Return structured JSON.
<rubric>
{singleton_rubric}
</rubric>
<singletons>
{singletons_json}
</singletons>
<existing-clusters>
A summary of existing clusters for reference (titles/themes only, not full contents):
{cluster_summaries}
</existing-clusters>
<output-contract>
Return ONLY a valid JSON array. No prose, no markdown, no explanation outside the JSON.
Each element must have:
- "item_id": the identifier of the singleton
- All fields requested by the singleton rubric (should_cluster, best_cluster_id, confidence, etc.)
Example output format (adapt field names to match the rubric):
[
{"item_id": "issue-1234", "should_cluster": true, "best_cluster_id": "cluster-42", "confidence": 4},
{"item_id": "issue-5678", "should_cluster": false, "best_cluster_id": null, "confidence": 5}
]
Rules:
- A singleton that genuinely has no match in existing clusters should get should_cluster: false
- A singleton that clearly belongs in an existing cluster should get should_cluster: true with the cluster ID
- High confidence (4-5) means you are very sure. Low confidence (1-2) means the item is borderline.
- Every singleton in the batch MUST appear in your output
</output-contract>
```
## Variable Reference
| Variable | Source | Description |
|----------|--------|-------------|
| `{rubric}` | Spec `metric.judge.rubric` | User-defined scoring rubric |
| `{items_json}` | Sampled output items | JSON array of items to evaluate (one batch worth) |
| `{singleton_rubric}` | Spec `metric.judge.singleton_rubric` | User-defined rubric for singleton evaluation |
| `{singletons_json}` | Sampled singleton items | JSON array of singleton items to evaluate |
| `{cluster_summaries}` | Experiment output | Summary of existing clusters (titles/themes) for singleton reference |
## Notes
- Designed for Haiku by default -- prompts are concise and well-structured for smaller models
- The rubric is part of the immutable measurement harness -- the experiment agent cannot modify it
- The `ambiguous` flag on items helps the orchestrator identify noisy evaluations without forcing bad scores
- For singleton evaluation, the orchestrator provides cluster summaries (not full contents) to keep judge context lean
- Each sub-agent evaluates one batch independently -- sub-agents do not see each other's results
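For reference, a minimal sketch of step 5 (aggregation), assuming each batch returned a JSON array matching the item evaluation contract above with `score` and `ambiguous` fields:
```python
# Illustrative only: fold per-batch judge results into the scoring fields.
def aggregate_judge_results(batches):
    items = [item for batch in batches for item in batch]
    mean_score = sum(i["score"] for i in items) / len(items)
    ambiguous_rate = sum(1 for i in items if i.get("ambiguous")) / len(items)
    return {"mean_score": round(mean_score, 3), "ambiguous_rate": round(ambiguous_rate, 3)}
```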


@@ -0,0 +1,392 @@
# Optimization Spec Schema
# This is the canonical schema for optimization spec files created by users
# to configure a /ce-optimize run. The orchestrating agent validates specs
# against this schema before proceeding.
#
# Usage: Create a YAML file matching this schema and pass it to /ce-optimize.
# The agent reads this spec, validates required fields, and uses it to
# configure the entire optimization run.
# ============================================================================
# REQUIRED FIELDS
# ============================================================================
required_fields:
name:
type: string
pattern: "^[a-z0-9]+(?:-[a-z0-9]+)*$"
description: "Unique identifier for this optimization run (lowercase kebab-case, safe for git refs and worktree paths)"
example: "improve-issue-clustering"
description:
type: string
description: "Human-readable description of the optimization goal"
example: "Improve coherence and coverage of issue/PR clusters"
metric:
type: object
description: "Three-tier metric configuration"
required_children:
primary:
type: object
description: "The metric the loop optimizes against"
required_children:
type:
type: enum
values:
- hard # scalar metric from measurement command (e.g., build time, test pass rate)
- judge # LLM-as-judge quality score from sampled outputs
description: "Whether the primary metric comes from the measurement command directly or from LLM-as-judge evaluation"
name:
type: string
description: "Metric name — must match a key in the measurement command's JSON output (for hard type) or a scoring field (for judge type)"
example: "cluster_coherence"
direction:
type: enum
values:
- maximize
- minimize
description: "Whether higher or lower is better"
optional_children:
baseline:
type: number
default: null
description: "Filled automatically during Phase 1 baseline measurement. Do not set manually."
target:
type: number
default: null
description: "Optional target value. Loop stops when this is reached."
example: 4.2
degenerate_gates:
type: array
description: "Fast boolean checks that reject obviously broken solutions before expensive evaluation. Run first, before the primary metric or judge."
required: true
items:
type: object
required_children:
name:
type: string
description: "Metric name — must match a key in the measurement command's JSON output"
check:
type: string
description: "Comparison operator and threshold. Supported operators: >=, <=, >, <, ==, !="
example: "<= 0.10"
optional_children:
description:
type: string
description: "Human-readable explanation of what this gate catches"
optional_children:
diagnostics:
type: array
default: []
description: "Metrics logged for understanding but never gated on. Useful for understanding WHY a primary metric changed."
items:
type: object
required_children:
name:
type: string
description: "Metric name — must match a key in the measurement command's JSON output"
judge:
type: object
description: "LLM-as-judge configuration. Required when metric.primary.type is 'judge'. Ignored when type is 'hard'."
required_when: "metric.primary.type == 'judge'"
required_children:
rubric:
type: string
description: "Multi-line rubric text sent to the judge model. Must instruct the judge to return JSON."
example: |
Rate this cluster 1-5:
- 5: All items clearly about the same issue/feature
- 4: Strong theme, minor outliers
- 3: Related but covers 2-3 sub-topics
- 2: Weak connection
- 1: Unrelated items grouped together
scoring:
type: object
required_children:
primary:
type: string
description: "Field name from judge JSON output to use as the primary optimization target"
example: "mean_score"
optional_children:
secondary:
type: array
default: []
description: "Additional scoring fields to log (not optimized against)"
optional_children:
model:
type: enum
values:
- haiku
- sonnet
default: haiku
description: "Model to use for judge evaluation. Haiku is cheaper and faster; Sonnet is more nuanced."
sample_size:
type: integer
default: 10
description: "Total number of output items to sample for judge evaluation per experiment"
stratification:
type: array
default: null
description: "Stratified sampling buckets. If null, uses uniform random sampling."
items:
type: object
required_children:
bucket:
type: string
description: "Bucket name for this stratum"
count:
type: integer
description: "Number of items to sample from this bucket"
singleton_sample:
type: integer
default: 0
description: "Number of singleton items to sample for false-negative evaluation"
singleton_rubric:
type: string
default: null
description: "Rubric for evaluating sampled singletons. Required if singleton_sample > 0."
sample_seed:
type: integer
default: 42
description: "Fixed seed for reproducible sampling across experiments"
batch_size:
type: integer
default: 5
description: "Number of samples per judge sub-agent batch. Controls parallelism vs overhead."
minimum_improvement:
type: number
default: 0.3
description: "Minimum judge score improvement required to accept an experiment as 'better'. Accounts for sample-composition variance when output structure changes between experiments. Distinct from measurement.stability.noise_threshold which handles run-to-run flakiness."
max_total_cost_usd:
type: number
default: 5
description: "Stop judge evaluation when cumulative judge spend reaches this cap. This is a first-run safety default; raise it only after the rubric and harness are trustworthy. Set to null only with explicit user approval."
measurement:
type: object
description: "How to run the measurement harness"
required_children:
command:
type: string
description: "Shell command that runs the evaluation and outputs JSON to stdout. The JSON must contain keys matching all gate names and diagnostic names."
example: "python evaluate.py"
optional_children:
timeout_seconds:
type: integer
default: 600
description: "Maximum seconds for the measurement command to run before being killed"
output_format:
type: enum
values:
- json
default: json
description: "Format of the measurement command's stdout. Currently only JSON is supported."
working_directory:
type: string
default: "."
description: "Working directory for the measurement command, relative to the repo root"
stability:
type: object
default: { mode: "stable" }
description: "How to handle metric variance across runs"
required_children:
mode:
type: enum
values:
- stable # run once, trust the result
- repeat # run N times, aggregate
default: stable
optional_children:
repeat_count:
type: integer
default: 5
description: "Number of times to run the harness when mode is 'repeat'"
aggregation:
type: enum
values:
- median
- mean
- min
- max
default: median
description: "How to combine repeated measurements into a single value"
noise_threshold:
type: number
default: 0.02
description: "Minimum improvement that must exceed this value to count as a real improvement (not noise). Applied to hard metrics only."
scope:
type: object
description: "What the experiment agent is allowed to modify"
required_children:
mutable:
type: array
description: "Files and directories the agent MAY modify during experiments"
items:
type: string
description: "File path or directory (relative to repo root). Directories match all files within."
example:
- "src/clustering/"
- "src/preprocessing/"
- "config/clustering.yaml"
immutable:
type: array
description: "Files and directories the agent MUST NOT modify. The measurement harness should always be listed here."
items:
type: string
example:
- "evaluate.py"
- "tests/fixtures/"
- "data/"
# ============================================================================
# OPTIONAL FIELDS
# ============================================================================
optional_fields:
execution:
type: object
default: { mode: "parallel", backend: "worktree", max_concurrent: 4 }
description: "How experiments are executed"
optional_children:
mode:
type: enum
values:
- parallel # run experiments simultaneously (default)
- serial # run one at a time
default: parallel
backend:
type: enum
values:
- worktree # git worktrees for isolation (default)
- codex # Codex sandboxes for isolation
default: worktree
max_concurrent:
type: integer
default: 4
minimum: 1
description: "Maximum experiments to run in parallel. Capped at 6 for worktree backend. 8+ only valid for Codex backend."
codex_security:
type: enum
values:
- full-auto # --full-auto (workspace write)
- yolo # --dangerously-bypass-approvals-and-sandbox
default: null
description: "Codex security posture. If null, user is asked once per session."
parallel:
type: object
default: {}
description: "Parallelism configuration discovered or set during Phase 1"
optional_children:
port_strategy:
type: enum
values:
- parameterized # use env var for port
- none # no port parameterization needed
default: null
description: "If null, auto-detected during Phase 1 parallelism probe"
port_env_var:
type: string
default: null
description: "Environment variable name for port parameterization (e.g., EVAL_PORT)"
port_base:
type: integer
default: null
description: "Base port number. Each experiment gets port_base + experiment_index."
shared_files:
type: array
default: []
description: "Files that must be copied into each experiment worktree (e.g., SQLite databases)"
items:
type: string
exclusive_resources:
type: array
default: []
description: "Resources requiring exclusive access (e.g., 'gpu'). If non-empty, forces serial mode."
items:
type: string
dependencies:
type: object
default: { approved: [] }
description: "Dependency management for experiments"
optional_children:
approved:
type: array
default: []
description: "Pre-approved new dependencies that experiments may add"
items:
type: string
constraints:
type: array
default: []
description: "Free-text constraints that experiment agents must follow"
items:
type: string
example:
- "Do not change the output format of clusters"
- "Preserve backward compatibility with existing cluster consumers"
stopping:
type: object
default: { max_iterations: 100, max_hours: 8, plateau_iterations: 10, target_reached: true }
description: "When the optimization loop should stop. Any criterion can trigger a stop."
optional_children:
max_iterations:
type: integer
default: 100
description: "Stop after this many total experiments"
max_hours:
type: number
default: 8
description: "Stop after this many hours of wall-clock time"
plateau_iterations:
type: integer
default: 10
description: "Stop if no improvement for this many consecutive experiments"
target_reached:
type: boolean
default: true
description: "Stop when the primary metric reaches the target value (if set)"
max_runner_up_merges_per_batch:
type: integer
default: 1
description: "Maximum number of file-disjoint runner-up experiments to attempt merging per batch after keeping the best experiment"
# ============================================================================
# VALIDATION RULES
# ============================================================================
validation_rules:
- "All required fields must be present"
- "name must be lowercase kebab-case (`^[a-z0-9]+(?:-[a-z0-9]+)*$`)"
- "metric.primary.type must be 'hard' or 'judge'"
- "If metric.primary.type is 'judge', metric.judge must be present with rubric and scoring"
- "metric.degenerate_gates must have at least one entry"
- "measurement.command must be a non-empty string"
- "scope.mutable must have at least one entry"
- "scope.immutable must have at least one entry"
- "Gate check operators must be one of: >=, <=, >, <, ==, !="
- "execution.max_concurrent must be >= 1"
- "execution.max_concurrent must not exceed 6 when execution.backend is 'worktree'"
- "If parallel.exclusive_resources is non-empty, execution.mode should be 'serial'"
- "If metric.judge.singleton_sample > 0, metric.judge.singleton_rubric must be present"
- "If metric.primary.type is 'judge' and metric.judge.max_total_cost_usd is null, the user should explicitly approve uncapped spend"
- "stopping must have at least one non-default criterion or use defaults"


@@ -0,0 +1,127 @@
# `/ce-optimize` Usage Guide
## What This Skill Is For
`/ce-optimize` is for hard engineering problems where:
1. You can try multiple code or config variants.
2. You can run the same evaluation against each variant.
3. You want the skill to keep the good variants and reject the bad ones.
It is best for "search the space and score the results" work, not one-shot implementation work.
## When To Use It
Use `/ce-optimize` when the problem looks like:
- "Find the smallest memory limit that stops OOM crashes without wasting RAM."
- "Tune clustering parameters without collapsing everything into one garbage cluster."
- "Find a prompt that is cheaper but still produces summaries good enough for downstream clustering."
- "Compare several ranking, retrieval, batching, or threshold strategies against the same harness."
Choose `type: hard` when success is objective and cheap to measure:
- Memory usage
- Latency
- Throughput
- Test pass rate
- Build time
Choose `type: judge` when a numeric metric can be gamed or when human usefulness matters:
- Cluster coherence
- Search relevance
- Summary quality
- Prompt quality
- Classification quality with semantic edge cases
## When Not To Use It
`/ce-optimize` is usually the wrong tool when:
- The fix is obvious and does not need experimentation
- There is no repeatable measurement harness
- The search space is fake and only has one plausible answer
- The cost of evaluating variants is too high to justify multiple runs
## How To Think About It
The pattern is:
1. Define the target.
2. Build or validate the measurement harness first.
3. Generate multiple plausible variants.
4. Run the same evaluation loop against each variant.
5. Keep the variants that improve the target without violating guard rails.
The core rule is simple:
- If a hard metric captures "better," optimize the hard metric.
- If a hard metric can be gamed, add LLM-as-judge.
Example: lowering a clustering threshold may increase cluster coverage. That sounds good until everything ends up in one giant cluster. Hard metrics may say "improved"; an LLM judge sampling real clusters can say "this is trash."
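How the guard rails catch that failure depends on the spec's degenerate gates. A minimal sketch of gate evaluation, assuming the check-string operators from the spec schema and a hypothetical `max_cluster_fraction` gate:
```python
# Illustrative only: degenerate-gate evaluation. Gate names here are hypothetical;
# the real gates and harness come from the spec, not from this guide.
import operator

OPS = {">=": operator.ge, "<=": operator.le, ">": operator.gt,
       "<": operator.lt, "==": operator.eq, "!=": operator.ne}

def gate_passes(value: float, check: str) -> bool:
    """Apply a check string like '>= 1.0' or '== 0' to a measured value."""
    op, threshold = check.split(maxsplit=1)
    return OPS[op](value, float(threshold))

# A variant that dumps 92% of items into one cluster improves "coverage" but fails
# a hypothetical collapse gate and never reaches the judge:
metrics = {"coverage": 0.98, "max_cluster_fraction": 0.92}
assert gate_passes(metrics["coverage"], ">= 0.8")
assert not gate_passes(metrics["max_cluster_fraction"], "<= 0.5")
```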
## First-Run Advice
For the first run:
- Prefer `execution.mode: serial`
- Set `execution.max_concurrent: 1`
- Keep `stopping.max_iterations` small
- Keep `stopping.max_hours` small
- Avoid new dependencies until the baseline is trustworthy
- In judge mode, use a small sample and a low cost cap
The goal of the first run is to validate the harness, not to win the optimization immediately.
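If no harness exists yet, start with something deliberately small. A minimal sketch of a measurement command, with output keys mirroring the hard-metric starter template; the `make` targets are assumptions, not part of this skill:
```python
# Illustrative evaluate.py sketch: run the build and tests, then print one JSON object
# whose keys match the spec's gate and diagnostic names.
import json
import subprocess
import time

def main():
    start = time.monotonic()
    build = subprocess.run(["make", "build"], capture_output=True)   # assumed build command
    tests = subprocess.run(["make", "test"], capture_output=True)    # assumed test command
    print(json.dumps({
        "build_seconds": round(time.monotonic() - start, 1),
        "build_passed": 1 if build.returncode == 0 else 0,
        "test_pass_rate": 1.0 if tests.returncode == 0 else 0.0,     # simplification: all-or-nothing
        "artifact_size_mb": 0.0,                                     # fill in from real build output
        "peak_memory_mb": 0.0,
    }))

if __name__ == "__main__":
    main()
```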
## Example Prompts
### 1. Memory Tuning
```text
Use /ce-optimize to find the smallest memory setting that keeps this service stable under our load test.
The current container limit is 512 MB and the app sometimes OOM-crashes. Do not just jump to 8 GB. Try a small set of realistic memory limits, run the same load test for each one, and score the results using:
- did the process OOM
- did tail latency spike badly
- did GC pauses become excessive
Prefer the smallest memory limit that passes the guard rails.
```
### 2. Clustering Quality
```text
Use /ce-optimize to improve issue and PR clustering quality.
We have about 18k open issues and PRs. We want to test changes that improve clustering quality, reduce singleton clusters, and improve match quality within each cluster.
Do not mutate the shared default database. Copy it for the run, then use per-experiment copies when needed.
Do not optimize only for coverage. Use LLM-as-judge to sample clusters and confirm they still preserve real semantic similarity instead of collapsing into giant low-quality clusters.
```
### 3. Prompt Optimization
```text
Use /ce-optimize to create a summarization prompt for issues and PRs that minimizes token spend while still producing summaries that are good enough for downstream clustering.
I want the loop to compare prompt variants, measure token cost, and judge whether the summaries preserve the distinctions needed to cluster related issues together without merging unrelated ones.
```
## Choosing Between Hard Metrics And Judge Mode
Use hard metrics alone when:
- "Better" is obvious from the numbers.
Add judge mode when:
- The numbers can improve while the real output gets worse.
Common pattern:
- Hard gates reject broken outputs.
- Judge mode scores the surviving candidates for actual usefulness.
That hybrid setup is often the best default for ranking, clustering, and prompt work.
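A minimal sketch of that hybrid decision, assuming the gate results and judge score are already available; `minimum_improvement` refers to the judge config field of the same name:
```python
# Illustrative only: gates first, then the judge score must beat the current best
# by at least minimum_improvement.
def accept(gates_passed: bool, judge_score: float, best_score: float,
           minimum_improvement: float = 0.2) -> str:
    if not gates_passed:
        return "degenerate"                 # broken output never reaches the judge
    if judge_score >= best_score + minimum_improvement:
        return "kept"
    return "reverted"

assert accept(False, 4.5, 3.0) == "degenerate"   # e.g. the one-giant-cluster variant
assert accept(True, 3.4, 3.0) == "kept"
assert accept(True, 3.1, 3.0) == "reverted"      # small gains are treated as sample noise
```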