# Optimization Spec Schema
#
# This is the canonical schema for optimization spec files created by users
# to configure a /ce-optimize run. The orchestrating agent validates specs
# against this schema before proceeding.
#
# Usage: Create a YAML file matching this schema and pass it to /ce-optimize.
# The agent reads this spec, validates required fields, and uses it to
# configure the entire optimization run.

# ============================================================================
# REQUIRED FIELDS
# ============================================================================

required_fields:
  name:
    type: string
    pattern: "^[a-z0-9]+(?:-[a-z0-9]+)*$"
    description: "Unique identifier for this optimization run (lowercase kebab-case, safe for git refs and worktree paths)"
    example: "improve-issue-clustering"

  description:
    type: string
    description: "Human-readable description of the optimization goal"
    example: "Improve coherence and coverage of issue/PR clusters"

  metric:
    type: object
    description: "Three-tier metric configuration: primary (optimized), degenerate_gates (hard rejects), diagnostics (logged only)"
    required_children:
      primary:
        type: object
        description: "The metric the loop optimizes against"
        required_children:
          type:
            type: enum
            values:
              - hard   # scalar metric from measurement command (e.g., build time, test pass rate)
              - judge  # LLM-as-judge quality score from sampled outputs
            description: "Whether the primary metric comes from the measurement command directly or from LLM-as-judge evaluation"
          name:
            type: string
            description: "Metric name; must match a key in the measurement command's JSON output (for hard type) or a scoring field (for judge type)"
            example: "cluster_coherence"
          direction:
            type: enum
            values:
              - maximize
              - minimize
            description: "Whether higher or lower is better"
        optional_children:
          baseline:
            type: number
            default: null
            description: "Filled automatically during Phase 1 baseline measurement. Do not set manually."
          target:
            type: number
            default: null
            description: "Optional target value. The loop stops when this is reached."
            example: 4.2

      degenerate_gates:
        type: array
        required: true
        description: "Fast boolean checks that reject obviously broken solutions before expensive evaluation. Run first, before the primary metric or judge."
        items:
          type: object
          required_children:
            name:
              type: string
              description: "Metric name; must match a key in the measurement command's JSON output"
            check:
              type: string
              description: "Comparison operator and threshold. Supported operators: >=, <=, >, <, ==, !="
              example: "<= 0.10"
          optional_children:
            description:
              type: string
              description: "Human-readable explanation of what this gate catches"

    optional_children:
      diagnostics:
        type: array
        default: []
        description: "Metrics logged for context but never gated on. Useful for understanding WHY the primary metric changed."
        items:
          type: object
          required_children:
            name:
              type: string
              description: "Metric name; must match a key in the measurement command's JSON output"

      judge:
        type: object
        description: "LLM-as-judge configuration. Required when metric.primary.type is 'judge'. Ignored when type is 'hard'."
        required_when: "metric.primary.type == 'judge'"
        required_children:
          rubric:
            type: string
            description: "Multi-line rubric text sent to the judge model. Must instruct the judge to return JSON."
            example: |
              Rate this cluster 1-5:
              - 5: All items clearly about the same issue/feature
              - 4: Strong theme, minor outliers
              - 3: Related but covers 2-3 sub-topics
              - 2: Weak connection
              - 1: Unrelated items grouped together
          scoring:
            type: object
            required_children:
              primary:
                type: string
                description: "Field name from judge JSON output to use as the primary optimization target"
                example: "mean_score"
            optional_children:
              secondary:
                type: array
                default: []
                description: "Additional scoring fields to log (not optimized against)"
        optional_children:
          model:
            type: enum
            values:
              - haiku
              - sonnet
            default: haiku
            description: "Model to use for judge evaluation. Haiku is cheaper and faster; Sonnet is more nuanced."
          sample_size:
            type: integer
            default: 10
            description: "Total number of output items to sample for judge evaluation per experiment"
          stratification:
            type: array
            default: null
            description: "Stratified sampling buckets. If null, uses uniform random sampling."
            items:
              type: object
              required_children:
                bucket:
                  type: string
                  description: "Bucket name for this stratum"
                count:
                  type: integer
                  description: "Number of items to sample from this bucket"
          singleton_sample:
            type: integer
            default: 0
            description: "Number of singleton items to sample for false-negative evaluation"
          singleton_rubric:
            type: string
            default: null
            description: "Rubric for evaluating sampled singletons. Required if singleton_sample > 0."
          sample_seed:
            type: integer
            default: 42
            description: "Fixed seed for reproducible sampling across experiments"
          batch_size:
            type: integer
            default: 5
            description: "Number of samples per judge sub-agent batch. Controls parallelism vs. overhead."
          minimum_improvement:
            type: number
            default: 0.3
            description: "Minimum judge-score improvement required to accept an experiment as 'better'. Accounts for sample-composition variance when output structure changes between experiments. Distinct from measurement.stability.noise_threshold, which handles run-to-run flakiness."
          max_total_cost_usd:
            type: number
            default: 5
            description: "Stop judge evaluation when cumulative judge spend reaches this cap. This is a first-run safety default; raise it only after the rubric and harness are trustworthy. Set to null only with explicit user approval."
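  # Example: a judge-type metric block. This is an illustrative sketch; the
  # gate name (singleton_rate) and its threshold are hypothetical and must
  # match keys emitted by your own measurement harness.
  #
  #   metric:
  #     primary:
  #       type: judge
  #       name: mean_score        # matches judge.scoring.primary
  #       direction: maximize
  #       target: 4.2
  #     degenerate_gates:
  #       - name: singleton_rate
  #         check: "<= 0.10"
  #         description: "Reject solutions that shatter everything into singletons"
  #     diagnostics:
  #       - name: cluster_count
  #     judge:
  #       rubric: |
  #         Rate this cluster 1-5: ...
  #         Return JSON: {"mean_score": <number>}
  #       scoring:
  #         primary: mean_score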
  measurement:
    type: object
    description: "How to run the measurement harness"
    required_children:
      command:
        type: string
        description: "Shell command that runs the evaluation and prints JSON to stdout. The JSON must contain keys matching all gate names and diagnostic names."
        example: "python evaluate.py"
    optional_children:
      timeout_seconds:
        type: integer
        default: 600
        description: "Maximum seconds the measurement command may run before being killed"
      output_format:
        type: enum
        values:
          - json
        default: json
        description: "Format of the measurement command's stdout. Currently only JSON is supported."
      working_directory:
        type: string
        default: "."
        description: "Working directory for the measurement command, relative to the repo root"
      stability:
        type: object
        default: { mode: "stable" }
        description: "How to handle metric variance across runs"
        required_children:
          mode:
            type: enum
            values:
              - stable  # run once, trust the result
              - repeat  # run N times, aggregate
            default: stable
        optional_children:
          repeat_count:
            type: integer
            default: 5
            description: "Number of times to run the harness when mode is 'repeat'"
          aggregation:
            type: enum
            values:
              - median
              - mean
              - min
              - max
            default: median
            description: "How to combine repeated measurements into a single value"
          noise_threshold:
            type: number
            default: 0.02
            description: "An improvement must exceed this value to count as real rather than noise. Applied to hard metrics only."
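  # Example: one run of measurement.command must print a single JSON object
  # whose keys cover every gate name and diagnostic name. Using the
  # hypothetical names from the metric example above, a passing run might
  # print:
  #
  #   {"singleton_rate": 0.07, "cluster_count": 41}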
description: "Working directory for the measurement command, relative to the repo root" stability: type: object default: { mode: "stable" } description: "How to handle metric variance across runs" required_children: mode: type: enum values: - stable # run once, trust the result - repeat # run N times, aggregate default: stable optional_children: repeat_count: type: integer default: 5 description: "Number of times to run the harness when mode is 'repeat'" aggregation: type: enum values: - median - mean - min - max default: median description: "How to combine repeated measurements into a single value" noise_threshold: type: number default: 0.02 description: "Minimum improvement that must exceed this value to count as a real improvement (not noise). Applied to hard metrics only." scope: type: object description: "What the experiment agent is allowed to modify" required_children: mutable: type: array description: "Files and directories the agent MAY modify during experiments" items: type: string description: "File path or directory (relative to repo root). Directories match all files within." example: - "src/clustering/" - "src/preprocessing/" - "config/clustering.yaml" immutable: type: array description: "Files and directories the agent MUST NOT modify. The measurement harness should always be listed here." items: type: string example: - "evaluate.py" - "tests/fixtures/" - "data/" # ============================================================================ # OPTIONAL FIELDS # ============================================================================ optional_fields: execution: type: object default: { mode: "parallel", backend: "worktree", max_concurrent: 4 } description: "How experiments are executed" optional_children: mode: type: enum values: - parallel # run experiments simultaneously (default) - serial # run one at a time default: parallel backend: type: enum values: - worktree # git worktrees for isolation (default) - codex # Codex sandboxes for isolation default: worktree max_concurrent: type: integer default: 4 minimum: 1 description: "Maximum experiments to run in parallel. Capped at 6 for worktree backend. 8+ only valid for Codex backend." codex_security: type: enum values: - full-auto # --full-auto (workspace write) - yolo # --dangerously-bypass-approvals-and-sandbox default: null description: "Codex security posture. If null, user is asked once per session." parallel: type: object default: {} description: "Parallelism configuration discovered or set during Phase 1" optional_children: port_strategy: type: enum values: - parameterized # use env var for port - none # no port parameterization needed default: null description: "If null, auto-detected during Phase 1 parallelism probe" port_env_var: type: string default: null description: "Environment variable name for port parameterization (e.g., EVAL_PORT)" port_base: type: integer default: null description: "Base port number. Each experiment gets port_base + experiment_index." shared_files: type: array default: [] description: "Files that must be copied into each experiment worktree (e.g., SQLite databases)" items: type: string exclusive_resources: type: array default: [] description: "Resources requiring exclusive access (e.g., 'gpu'). If non-empty, forces serial mode." 
# ============================================================================
# VALIDATION RULES
# ============================================================================

validation_rules:
  - "All required fields must be present"
  - "name must be lowercase kebab-case (`^[a-z0-9]+(?:-[a-z0-9]+)*$`)"
  - "metric.primary.type must be 'hard' or 'judge'"
  - "If metric.primary.type is 'judge', metric.judge must be present with rubric and scoring"
  - "metric.degenerate_gates must have at least one entry"
  - "measurement.command must be a non-empty string"
  - "scope.mutable must have at least one entry"
  - "scope.immutable must have at least one entry"
  - "Gate check operators must be one of: >=, <=, >, <, ==, !="
  - "execution.max_concurrent must be >= 1"
  - "execution.max_concurrent must not exceed 6 when execution.backend is 'worktree'"
  - "If parallel.exclusive_resources is non-empty, execution.mode should be 'serial'"
  - "If metric.judge.singleton_sample > 0, metric.judge.singleton_rubric must be present"
  - "If metric.primary.type is 'judge' and metric.judge.max_total_cost_usd is null, the user should explicitly approve uncapped spend"
  - "stopping may rely entirely on its defaults; if overridden, at least one stopping criterion must remain set"
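# ============================================================================
# MINIMAL EXAMPLE
# ============================================================================
# A minimal sketch of a complete spec using a hard metric. Paths, metric
# names, and the gate threshold are illustrative assumptions; the JSON keys
# must come from your own measurement harness.
#
#   name: speed-up-eval
#   description: "Reduce end-to-end evaluation time"
#   metric:
#     primary:
#       type: hard
#       name: eval_seconds
#       direction: minimize
#     degenerate_gates:
#       - name: test_pass_rate
#         check: ">= 1.0"
#   measurement:
#     command: "python evaluate.py"
#   scope:
#     mutable:
#       - "src/"
#     immutable:
#       - "evaluate.py"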