feat(ce-optimize): Auto-research loop for tuning system prompts / vector clustering / evaluating different code solutions / etc (#446)

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-12 23:16:09 -04:00
parent 4e0ed2cc8d
commit 8f20aa0406
15 changed files with 3970 additions and 1 deletions
--- a/plugins/compound-engineering/skills/ce-optimize/references/optimize-spec-schema.yaml
+++ b/plugins/compound-engineering/skills/ce-optimize/references/optimize-spec-schema.yaml
@@ -0,0 +1,392 @@
+# Optimization Spec Schema
+# This is the canonical schema for optimization spec files created by users
+# to configure a /ce-optimize run. The orchestrating agent validates specs
+# against this schema before proceeding.
+#
+# Usage: Create a YAML file matching this schema and pass it to /ce-optimize.
+# The agent reads this spec, validates required fields, and uses it to
+# configure the entire optimization run.
+
+# ============================================================================
+# REQUIRED FIELDS
+# ============================================================================
+
+required_fields:
+
+  name:
+    type: string
+    pattern: "^[a-z0-9]+(?:-[a-z0-9]+)*$"
+    description: "Unique identifier for this optimization run (lowercase kebab-case, safe for git refs and worktree paths)"
+    example: "improve-issue-clustering"
+
+  description:
+    type: string
+    description: "Human-readable description of the optimization goal"
+    example: "Improve coherence and coverage of issue/PR clusters"
+
+  metric:
+    type: object
+    description: "Three-tier metric configuration"
+    required_children:
+
+      primary:
+        type: object
+        description: "The metric the loop optimizes against"
+        required_children:
+
+          type:
+            type: enum
+            values:
+              - hard    # scalar metric from measurement command (e.g., build time, test pass rate)
+              - judge   # LLM-as-judge quality score from sampled outputs
+            description: "Whether the primary metric comes from the measurement command directly or from LLM-as-judge evaluation"
+
+          name:
+            type: string
+            description: "Metric name — must match a key in the measurement command's JSON output (for hard type) or a scoring field (for judge type)"
+            example: "cluster_coherence"
+
+          direction:
+            type: enum
+            values:
+              - maximize
+              - minimize
+            description: "Whether higher or lower is better"
+
+        optional_children:
+
+          baseline:
+            type: number
+            default: null
+            description: "Filled automatically during Phase 1 baseline measurement. Do not set manually."
+
+          target:
+            type: number
+            default: null
+            description: "Optional target value. The loop stops once the primary metric reaches this value, provided stopping.target_reached is true (the default)."
+            example: 4.2
+
+      degenerate_gates:
+        type: array
+        description: "Fast boolean checks that reject obviously broken solutions before expensive evaluation. Run first, before the primary metric or judge."
+        required: true
+        items:
+          type: object
+          required_children:
+            name:
+              type: string
+              description: "Metric name — must match a key in the measurement command's JSON output"
+            check:
+              type: string
+              description: "Comparison operator and threshold. Supported operators: >=, <=, >, <, ==, !="
+              example: "<= 0.10"
+          optional_children:
+            description:
+              type: string
+              description: "Human-readable explanation of what this gate catches"
+
+    optional_children:
+
+      diagnostics:
+        type: array
+        default: []
+        description: "Metrics logged for understanding but never gated on. Useful for understanding WHY a primary metric changed."
+        items:
+          type: object
+          required_children:
+            name:
+              type: string
+              description: "Metric name — must match a key in the measurement command's JSON output"
+
+      judge:
+        type: object
+        description: "LLM-as-judge configuration. Required when metric.primary.type is 'judge'. Ignored when metric.primary.type is 'hard'."
+        required_when: "metric.primary.type == 'judge'"
+        required_children:
+          rubric:
+            type: string
+            description: "Multi-line rubric text sent to the judge model. Must instruct the judge to return JSON."
+            example: |
+              Rate this cluster 1-5:
+              - 5: All items clearly about the same issue/feature
+              - 4: Strong theme, minor outliers
+              - 3: Related but covers 2-3 sub-topics
+              - 2: Weak connection
+              - 1: Unrelated items grouped together
+          scoring:
+            type: object
+            required_children:
+              primary:
+                type: string
+                description: "Field name from judge JSON output to use as the primary optimization target"
+                example: "mean_score"
+            optional_children:
+              secondary:
+                type: array
+                default: []
+                description: "Additional scoring fields to log (not optimized against)"
+        optional_children:
+          model:
+            type: enum
+            values:
+              - haiku
+              - sonnet
+            default: haiku
+            description: "Model to use for judge evaluation. Haiku is cheaper and faster; Sonnet is more nuanced."
+          sample_size:
+            type: integer
+            default: 10
+            description: "Total number of output items to sample for judge evaluation per experiment"
+          stratification:
+            type: array
+            default: null
+            description: "Stratified sampling buckets. If null, uses uniform random sampling."
+            items:
+              type: object
+              required_children:
+                bucket:
+                  type: string
+                  description: "Bucket name for this stratum"
+                count:
+                  type: integer
+                  description: "Number of items to sample from this bucket"
+          singleton_sample:
+            type: integer
+            default: 0
+            description: "Number of singleton items to sample for false-negative evaluation"
+          singleton_rubric:
+            type: string
+            default: null
+            description: "Rubric for evaluating sampled singletons. Required if singleton_sample > 0."
+          sample_seed:
+            type: integer
+            default: 42
+            description: "Fixed seed for reproducible sampling across experiments"
+          batch_size:
+            type: integer
+            default: 5
+            description: "Number of samples per judge sub-agent batch. Controls parallelism vs overhead."
+          minimum_improvement:
+            type: number
+            default: 0.3
+            description: "Minimum judge score improvement required to accept an experiment as 'better'. Accounts for sample-composition variance when output structure changes between experiments. Distinct from measurement.stability.noise_threshold which handles run-to-run flakiness."
+          max_total_cost_usd:
+            type: number
+            default: 5
+            description: "Stop judge evaluation when cumulative judge spend reaches this cap. This is a first-run safety default; raise it only after the rubric and harness are trustworthy. Set to null only with explicit user approval."
+
+  measurement:
+    type: object
+    description: "How to run the measurement harness"
+    required_children:
+      command:
+        type: string
+        description: "Shell command that runs the evaluation and outputs JSON to stdout. The JSON must contain keys matching all gate names and diagnostic names, plus the primary metric name when metric.primary.type is 'hard'."
+        example: "python evaluate.py"
+    optional_children:
+      timeout_seconds:
+        type: integer
+        default: 600
+        description: "Maximum seconds for the measurement command to run before being killed"
+      output_format:
+        type: enum
+        values:
+          - json
+        default: json
+        description: "Format of the measurement command's stdout. Currently only JSON is supported."
+      working_directory:
+        type: string
+        default: "."
+        description: "Working directory for the measurement command, relative to the repo root"
+      stability:
+        type: object
+        default: { mode: "stable" }
+        description: "How to handle metric variance across runs"
+        required_children:
+          mode:
+            type: enum
+            values:
+              - stable   # run once, trust the result
+              - repeat   # run N times, aggregate
+            default: stable
+        optional_children:
+          repeat_count:
+            type: integer
+            default: 5
+            description: "Number of times to run the harness when mode is 'repeat'"
+          aggregation:
+            type: enum
+            values:
+              - median
+              - mean
+              - min
+              - max
+            default: median
+            description: "How to combine repeated measurements into a single value"
+          noise_threshold:
+            type: number
+            default: 0.02
+            description: "An improvement must exceed this value to count as real rather than run-to-run noise. Applied to hard metrics only."
+
+  scope:
+    type: object
+    description: "What the experiment agent is allowed to modify"
+    required_children:
+      mutable:
+        type: array
+        description: "Files and directories the agent MAY modify during experiments"
+        items:
+          type: string
+          description: "File path or directory (relative to repo root). Directories match all files within."
+        example:
+          - "src/clustering/"
+          - "src/preprocessing/"
+          - "config/clustering.yaml"
+      immutable:
+        type: array
+        description: "Files and directories the agent MUST NOT modify. The measurement harness should always be listed here."
+        items:
+          type: string
+        example:
+          - "evaluate.py"
+          - "tests/fixtures/"
+          - "data/"
+
+# ============================================================================
+# OPTIONAL FIELDS
+# ============================================================================
+
+optional_fields:
+
+  execution:
+    type: object
+    default: { mode: "parallel", backend: "worktree", max_concurrent: 4 }
+    description: "How experiments are executed"
+    optional_children:
+      mode:
+        type: enum
+        values:
+          - parallel  # run experiments simultaneously (default)
+          - serial    # run one at a time
+        default: parallel
+      backend:
+        type: enum
+        values:
+          - worktree  # git worktrees for isolation (default)
+          - codex     # Codex sandboxes for isolation
+        default: worktree
+      max_concurrent:
+        type: integer
+        default: 4
+        minimum: 1
+        description: "Maximum experiments to run in parallel. Capped at 6 for the worktree backend; values above 6 are only valid for the Codex backend."
+      codex_security:
+        type: enum
+        values:
+          - full-auto                                # --full-auto (workspace write)
+          - yolo                                     # --dangerously-bypass-approvals-and-sandbox
+        default: null
+        description: "Codex security posture. If null, user is asked once per session."
+
+  parallel:
+    type: object
+    default: {}
+    description: "Parallelism configuration discovered or set during Phase 1"
+    optional_children:
+      port_strategy:
+        type: enum
+        values:
+          - parameterized  # use env var for port
+          - none           # no port parameterization needed
+        default: null
+        description: "If null, auto-detected during Phase 1 parallelism probe"
+      port_env_var:
+        type: string
+        default: null
+        description: "Environment variable name for port parameterization (e.g., EVAL_PORT)"
+      port_base:
+        type: integer
+        default: null
+        description: "Base port number. Each experiment gets port_base + experiment_index."
+      shared_files:
+        type: array
+        default: []
+        description: "Files that must be copied into each experiment worktree (e.g., SQLite databases)"
+        items:
+          type: string
+      exclusive_resources:
+        type: array
+        default: []
+        description: "Resources requiring exclusive access (e.g., 'gpu'). If non-empty, forces serial mode."
+        items:
+          type: string
+
+  dependencies:
+    type: object
+    default: { approved: [] }
+    description: "Dependency management for experiments"
+    optional_children:
+      approved:
+        type: array
+        default: []
+        description: "Pre-approved new dependencies that experiments may add"
+        items:
+          type: string
+
+  constraints:
+    type: array
+    default: []
+    description: "Free-text constraints that experiment agents must follow"
+    items:
+      type: string
+    example:
+      - "Do not change the output format of clusters"
+      - "Preserve backward compatibility with existing cluster consumers"
+
+  stopping:
+    type: object
+    default: { max_iterations: 100, max_hours: 8, plateau_iterations: 10, target_reached: true }
+    description: "When the optimization loop should stop. Any criterion can trigger a stop."
+    optional_children:
+      max_iterations:
+        type: integer
+        default: 100
+        description: "Stop after this many total experiments"
+      max_hours:
+        type: number
+        default: 8
+        description: "Stop after this many hours of wall-clock time"
+      plateau_iterations:
+        type: integer
+        default: 10
+        description: "Stop if no improvement for this many consecutive experiments"
+      target_reached:
+        type: boolean
+        default: true
+        description: "Stop when the primary metric reaches the target value (if set)"
+
+  max_runner_up_merges_per_batch:
+    type: integer
+    default: 1
+    description: "Maximum number of file-disjoint runner-up experiments to attempt merging per batch after keeping the best experiment"
+
+# ============================================================================
+# VALIDATION RULES
+# ============================================================================
+
+validation_rules:
+  - "All required fields must be present"
+  - "name must be lowercase kebab-case (`^[a-z0-9]+(?:-[a-z0-9]+)*$`)"
+  - "metric.primary.type must be 'hard' or 'judge'"
+  - "If metric.primary.type is 'judge', metric.judge must be present with rubric and scoring"
+  - "metric.degenerate_gates must have at least one entry"
+  - "measurement.command must be a non-empty string"
+  - "scope.mutable must have at least one entry"
+  - "scope.immutable must have at least one entry"
+  - "Gate check operators must be one of: >=, <=, >, <, ==, !="
+  - "execution.max_concurrent must be >= 1"
+  - "execution.max_concurrent must not exceed 6 when execution.backend is 'worktree'"
+  - "If parallel.exclusive_resources is non-empty, execution.mode should be 'serial'"
+  - "If metric.judge.singleton_sample > 0, metric.judge.singleton_rubric must be present"
+  - "If metric.primary.type is 'judge' and metric.judge.max_total_cost_usd is null, the user should explicitly approve uncapped spend"
+  - "stopping must have at least one non-default criterion or use defaults"