---
# Optimization Spec Schema
#
# This is the canonical schema for optimization spec files created by users
# to configure a /ce-optimize run. The orchestrating agent validates specs
# against this schema before proceeding.
#
# Usage: Create a YAML file matching this schema and pass it to /ce-optimize.
# The agent reads this spec, validates required fields, and uses it to
# configure the entire optimization run.

# ============================================================================
# REQUIRED FIELDS
# ============================================================================
required_fields:

  name:
    type: string
    pattern: "^[a-z0-9]+(?:-[a-z0-9]+)*$"
    description: "Unique identifier for this optimization run (lowercase kebab-case, safe for git refs and worktree paths)"
    example: "improve-issue-clustering"

  description:
    type: string
    description: "Human-readable description of the optimization goal"
    example: "Improve coherence and coverage of issue/PR clusters"

  metric:
    type: object
    description: "Three-tier metric configuration"
    required_children:

      primary:
        type: object
        description: "The metric the loop optimizes against"
        required_children:

          type:
            type: enum
            values:
              - hard   # scalar metric from measurement command (e.g., build time, test pass rate)
              - judge  # LLM-as-judge quality score from sampled outputs
            description: "Whether the primary metric comes from the measurement command directly or from LLM-as-judge evaluation"

          name:
            type: string
            description: "Metric name — must match a key in the measurement command's JSON output (for hard type) or a scoring field (for judge type)"
            example: "cluster_coherence"

          direction:
            type: enum
            values:
              - maximize
              - minimize
            description: "Whether higher or lower is better"

        optional_children:

          baseline:
            type: number
            default: null
            description: "Filled automatically during Phase 1 baseline measurement. Do not set manually."

          target:
            type: number
            default: null
            description: "Optional target value. Loop stops when this is reached."
            example: 4.2

      degenerate_gates:
        type: array
        description: "Fast boolean checks that reject obviously broken solutions before expensive evaluation. Run first, before the primary metric or judge."
        required: true
        items:
          type: object
          required_children:
            name:
              type: string
              description: "Metric name — must match a key in the measurement command's JSON output"
            check:
              type: string
              description: "Comparison operator and threshold. Supported operators: >=, <=, >, <, ==, !="
              example: "<= 0.10"
          optional_children:
            description:
              type: string
              description: "Human-readable explanation of what this gate catches"

    optional_children:

      diagnostics:
        type: array
        default: []
        description: "Metrics logged for understanding but never gated on. Useful for understanding WHY a primary metric changed."
        items:
          type: object
          required_children:
            name:
              type: string
              description: "Metric name — must match a key in the measurement command's JSON output"

      judge:
        type: object
        description: "LLM-as-judge configuration. Required when metric.primary.type is 'judge'. Ignored when type is 'hard'."
        required_when: "metric.primary.type == 'judge'"
        required_children:
          rubric:
            type: string
            description: "Multi-line rubric text sent to the judge model. Must instruct the judge to return JSON."
            example: |
              Rate this cluster 1-5:
              - 5: All items clearly about the same issue/feature
              - 4: Strong theme, minor outliers
              - 3: Related but covers 2-3 sub-topics
              - 2: Weak connection
              - 1: Unrelated items grouped together
          scoring:
            type: object
            required_children:
              primary:
                type: string
                description: "Field name from judge JSON output to use as the primary optimization target"
                example: "mean_score"
            optional_children:
              secondary:
                type: array
                default: []
                description: "Additional scoring fields to log (not optimized against)"
        optional_children:
          model:
            type: enum
            values:
              - haiku
              - sonnet
            default: haiku
            description: "Model to use for judge evaluation. Haiku is cheaper and faster; Sonnet is more nuanced."
          sample_size:
            type: integer
            default: 10
            description: "Total number of output items to sample for judge evaluation per experiment"
          stratification:
            type: array
            default: null
            description: "Stratified sampling buckets. If null, uses uniform random sampling."
            items:
              type: object
              required_children:
                bucket:
                  type: string
                  description: "Bucket name for this stratum"
                count:
                  type: integer
                  description: "Number of items to sample from this bucket"
          singleton_sample:
            type: integer
            default: 0
            description: "Number of singleton items to sample for false-negative evaluation"
          singleton_rubric:
            type: string
            default: null
            description: "Rubric for evaluating sampled singletons. Required if singleton_sample > 0."
          sample_seed:
            type: integer
            default: 42
            description: "Fixed seed for reproducible sampling across experiments"
          batch_size:
            type: integer
            default: 5
            description: "Number of samples per judge sub-agent batch. Controls parallelism vs overhead."
          minimum_improvement:
            type: number
            default: 0.3
            description: "Minimum judge score improvement required to accept an experiment as 'better'. Accounts for sample-composition variance when output structure changes between experiments. Distinct from measurement.stability.noise_threshold which handles run-to-run flakiness."
          max_total_cost_usd:
            type: number
            default: 5
            description: "Stop judge evaluation when cumulative judge spend reaches this cap. This is a first-run safety default; raise it only after the rubric and harness are trustworthy. Set to null only with explicit user approval."

  measurement:
    type: object
    description: "How to run the measurement harness"
    required_children:
      command:
        type: string
        description: "Shell command that runs the evaluation and outputs JSON to stdout. The JSON must contain keys matching all gate names and diagnostic names."
        example: "python evaluate.py"
    optional_children:
      timeout_seconds:
        type: integer
        default: 600
        description: "Maximum seconds for the measurement command to run before being killed"
      output_format:
        type: enum
        values:
          - json
        default: json
        description: "Format of the measurement command's stdout. Currently only JSON is supported."
      working_directory:
        type: string
        default: "."
        description: "Working directory for the measurement command, relative to the repo root"
      stability:
        type: object
        default: { mode: "stable" }
        description: "How to handle metric variance across runs"
        required_children:
          mode:
            type: enum
            values:
              - stable  # run once, trust the result
              - repeat  # run N times, aggregate
            default: stable
        optional_children:
          repeat_count:
            type: integer
            default: 5
            description: "Number of times to run the harness when mode is 'repeat'"
          aggregation:
            type: enum
            values:
              - median
              - mean
              - min
              - max
            default: median
            description: "How to combine repeated measurements into a single value"
          noise_threshold:
            type: number
            default: 0.02
            description: "Minimum improvement that must exceed this value to count as a real improvement (not noise). Applied to hard metrics only."

  scope:
    type: object
    description: "What the experiment agent is allowed to modify"
    required_children:
      mutable:
        type: array
        description: "Files and directories the agent MAY modify during experiments"
        items:
          type: string
          description: "File path or directory (relative to repo root). Directories match all files within."
        example:
          - "src/clustering/"
          - "src/preprocessing/"
          - "config/clustering.yaml"
      immutable:
        type: array
        description: "Files and directories the agent MUST NOT modify. The measurement harness should always be listed here."
        items:
          type: string
        example:
          - "evaluate.py"
          - "tests/fixtures/"
          - "data/"
# ============================================================================
# OPTIONAL FIELDS
# ============================================================================

optional_fields:

  execution:
    type: object
    default: { mode: "parallel", backend: "worktree", max_concurrent: 4 }
    description: "How experiments are executed"
    optional_children:
      mode:
        type: enum
        values:
          - parallel  # run experiments simultaneously (default)
          - serial    # run one at a time
        default: parallel
      backend:
        type: enum
        values:
          - worktree  # git worktrees for isolation (default)
          - codex     # Codex sandboxes for isolation
        default: worktree
      max_concurrent:
        type: integer
        default: 4
        minimum: 1
        description: "Maximum experiments to run in parallel. Capped at 6 for worktree backend. 8+ only valid for Codex backend."
      codex_security:
        type: enum
        values:
          - full-auto  # --full-auto (workspace write)
          - yolo       # --dangerously-bypass-approvals-and-sandbox
        default: null
        description: "Codex security posture. If null, user is asked once per session."

  parallel:
    type: object
    default: {}
    description: "Parallelism configuration discovered or set during Phase 1"
    optional_children:
      port_strategy:
        type: enum
        values:
          - parameterized  # use env var for port
          - none           # no port parameterization needed
        default: null
        description: "If null, auto-detected during Phase 1 parallelism probe"
      port_env_var:
        type: string
        default: null
        description: "Environment variable name for port parameterization (e.g., EVAL_PORT)"
      port_base:
        type: integer
        default: null
        description: "Base port number. Each experiment gets port_base + experiment_index."
      shared_files:
        type: array
        default: []
        description: "Files that must be copied into each experiment worktree (e.g., SQLite databases)"
        items:
          type: string
      exclusive_resources:
        type: array
        default: []
        description: "Resources requiring exclusive access (e.g., 'gpu'). If non-empty, forces serial mode."
        items:
          type: string

  dependencies:
    type: object
    default: { approved: [] }
    description: "Dependency management for experiments"
    optional_children:
      approved:
        type: array
        default: []
        description: "Pre-approved new dependencies that experiments may add"
        items:
          type: string

  constraints:
    type: array
    default: []
    description: "Free-text constraints that experiment agents must follow"
    items:
      type: string
    example:
      - "Do not change the output format of clusters"
      - "Preserve backward compatibility with existing cluster consumers"

  stopping:
    type: object
    default: { max_iterations: 100, max_hours: 8, plateau_iterations: 10, target_reached: true }
    description: "When the optimization loop should stop. Any criterion can trigger a stop."
    optional_children:
      max_iterations:
        type: integer
        default: 100
        description: "Stop after this many total experiments"
      max_hours:
        type: number
        default: 8
        description: "Stop after this many hours of wall-clock time"
      plateau_iterations:
        type: integer
        default: 10
        description: "Stop if no improvement for this many consecutive experiments"
      target_reached:
        type: boolean
        default: true
        description: "Stop when the primary metric reaches the target value (if set)"

  max_runner_up_merges_per_batch:
    type: integer
    default: 1
    description: "Maximum number of file-disjoint runner-up experiments to attempt merging per batch after keeping the best experiment"
# ============================================================================
# VALIDATION RULES
# ============================================================================

validation_rules:
  - "All required fields must be present"
  - "name must be lowercase kebab-case (`^[a-z0-9]+(?:-[a-z0-9]+)*$`)"
  - "metric.primary.type must be 'hard' or 'judge'"
  - "If metric.primary.type is 'judge', metric.judge must be present with rubric and scoring"
  - "metric.degenerate_gates must have at least one entry"
  - "measurement.command must be a non-empty string"
  - "scope.mutable must have at least one entry"
  - "scope.immutable must have at least one entry"
  - "Gate check operators must be one of: >=, <=, >, <, ==, !="
  - "execution.max_concurrent must be >= 1"
  - "execution.max_concurrent must not exceed 6 when execution.backend is 'worktree'"
  - "If parallel.exclusive_resources is non-empty, execution.mode should be 'serial'"
  - "If metric.judge.singleton_sample > 0, metric.judge.singleton_rubric must be present"
  - "If metric.primary.type is 'judge' and metric.judge.max_total_cost_usd is null, the user should explicitly approve uncapped spend"
  - "stopping must have at least one non-default criterion or use defaults"