---
# Optimization Spec Schema
#
# This is the canonical schema for optimization spec files created by users
# to configure a /ce-optimize run. The orchestrating agent validates specs
# against this schema before proceeding.
#
# Usage: Create a YAML file matching this schema and pass it to /ce-optimize.
# The agent reads this spec, validates required fields, and uses it to
# configure the entire optimization run.

# ============================================================================
# REQUIRED FIELDS
# ============================================================================
required_fields:

  name:
    type: string
    pattern: "^[a-z0-9]+(?:-[a-z0-9]+)*$"
    description: "Unique identifier for this optimization run (lowercase kebab-case, safe for git refs and worktree paths)"
    example: "improve-issue-clustering"

  description:
    type: string
    description: "Human-readable description of the optimization goal"
    example: "Improve coherence and coverage of issue/PR clusters"

  metric:
    type: object
    description: "Three-tier metric configuration"
    required_children:

      primary:
        type: object
        description: "The metric the loop optimizes against"
        required_children:

          type:
            type: enum
            values:
              - hard   # scalar metric from measurement command (e.g., build time, test pass rate)
              - judge  # LLM-as-judge quality score from sampled outputs
            description: "Whether the primary metric comes from the measurement command directly or from LLM-as-judge evaluation"

          name:
            type: string
            description: "Metric name — must match a key in the measurement command's JSON output (for hard type) or a scoring field (for judge type)"
            example: "cluster_coherence"

          direction:
            type: enum
            values:
              - maximize
              - minimize
            description: "Whether higher or lower is better"

        optional_children:

          baseline:
            type: number
            default: null
            description: "Filled automatically during Phase 1 baseline measurement. Do not set manually."

          target:
            type: number
            default: null
            description: "Optional target value. Loop stops when this is reached."
            example: 4.2

      degenerate_gates:
        type: array
        description: "Fast boolean checks that reject obviously broken solutions before expensive evaluation. Run first, before the primary metric or judge."
        required: true
        items:
          type: object
          required_children:
            name:
              type: string
              description: "Metric name — must match a key in the measurement command's JSON output"
            check:
              type: string
              description: "Comparison operator and threshold. Supported operators: >=, <=, >, <, ==, !="
              example: "<= 0.10"
          optional_children:
            description:
              type: string
              description: "Human-readable explanation of what this gate catches"

    optional_children:

      diagnostics:
        type: array
        default: []
        description: "Metrics logged for understanding but never gated on. Useful for understanding WHY a primary metric changed."
        items:
          type: object
          required_children:
            name:
              type: string
              description: "Metric name — must match a key in the measurement command's JSON output"

      judge:
        type: object
        description: "LLM-as-judge configuration. Required when metric.primary.type is 'judge'. Ignored when type is 'hard'."
        required_when: "metric.primary.type == 'judge'"
        required_children:
          rubric:
            type: string
            description: "Multi-line rubric text sent to the judge model. Must instruct the judge to return JSON."
            example: |
              Rate this cluster 1-5:
              - 5: All items clearly about the same issue/feature
              - 4: Strong theme, minor outliers
              - 3: Related but covers 2-3 sub-topics
              - 2: Weak connection
              - 1: Unrelated items grouped together
          scoring:
            type: object
            required_children:
              primary:
                type: string
                description: "Field name from judge JSON output to use as the primary optimization target"
                example: "mean_score"
            optional_children:
              secondary:
                type: array
                default: []
                description: "Additional scoring fields to log (not optimized against)"
        optional_children:
          model:
            type: enum
            values:
              - haiku
              - sonnet
            default: haiku
            description: "Model to use for judge evaluation. Haiku is cheaper and faster; Sonnet is more nuanced."
          sample_size:
            type: integer
            default: 10
            description: "Total number of output items to sample for judge evaluation per experiment"
          stratification:
            type: array
            default: null
            description: "Stratified sampling buckets. If null, uses uniform random sampling."
            items:
              type: object
              required_children:
                bucket:
                  type: string
                  description: "Bucket name for this stratum"
                count:
                  type: integer
                  description: "Number of items to sample from this bucket"
          singleton_sample:
            type: integer
            default: 0
            description: "Number of singleton items to sample for false-negative evaluation"
          singleton_rubric:
            type: string
            default: null
            description: "Rubric for evaluating sampled singletons. Required if singleton_sample > 0."
          sample_seed:
            type: integer
            default: 42
            description: "Fixed seed for reproducible sampling across experiments"
          batch_size:
            type: integer
            default: 5
            description: "Number of samples per judge sub-agent batch. Controls parallelism vs overhead."
          minimum_improvement:
            type: number
            default: 0.3
            description: "Minimum judge score improvement required to accept an experiment as 'better'. Accounts for sample-composition variance when output structure changes between experiments. Distinct from measurement.stability.noise_threshold which handles run-to-run flakiness."
          max_total_cost_usd:
            type: number
            default: 5
            description: "Stop judge evaluation when cumulative judge spend reaches this cap. This is a first-run safety default; raise it only after the rubric and harness are trustworthy. Set to null only with explicit user approval."

  measurement:
    type: object
    description: "How to run the measurement harness"
    required_children:
      command:
        type: string
        description: "Shell command that runs the evaluation and outputs JSON to stdout. The JSON must contain keys matching all gate names and diagnostic names."
        example: "python evaluate.py"
    optional_children:
      timeout_seconds:
        type: integer
        default: 600
        description: "Maximum seconds for the measurement command to run before being killed"
      output_format:
        type: enum
        values:
          - json
        default: json
        description: "Format of the measurement command's stdout. Currently only JSON is supported."
      working_directory:
        type: string
        default: "."
        description: "Working directory for the measurement command, relative to the repo root"
      stability:
        type: object
        default: { mode: "stable" }
        description: "How to handle metric variance across runs"
        required_children:
          mode:
            type: enum
            values:
              - stable  # run once, trust the result
              - repeat  # run N times, aggregate
            default: stable
        optional_children:
          repeat_count:
            type: integer
            default: 5
            description: "Number of times to run the harness when mode is 'repeat'"
          aggregation:
            type: enum
            values:
              - median
              - mean
              - min
              - max
            default: median
            description: "How to combine repeated measurements into a single value"
          noise_threshold:
            type: number
            default: 0.02
            description: "Minimum improvement that must exceed this value to count as a real improvement (not noise). Applied to hard metrics only."

  scope:
    type: object
    description: "What the experiment agent is allowed to modify"
    required_children:
      mutable:
        type: array
        description: "Files and directories the agent MAY modify during experiments"
        items:
          type: string
          description: "File path or directory (relative to repo root). Directories match all files within."
        example:
          - "src/clustering/"
          - "src/preprocessing/"
          - "config/clustering.yaml"
      immutable:
        type: array
        description: "Files and directories the agent MUST NOT modify. The measurement harness should always be listed here."
        items:
          type: string
        example:
          - "evaluate.py"
          - "tests/fixtures/"
          - "data/"
# ============================================================================
# OPTIONAL FIELDS
# ============================================================================

optional_fields:

  execution:
    type: object
    default: { mode: "parallel", backend: "worktree", max_concurrent: 4 }
    description: "How experiments are executed"
    optional_children:
      mode:
        type: enum
        values:
          - parallel  # run experiments simultaneously (default)
          - serial    # run one at a time
        default: parallel
      backend:
        type: enum
        values:
          - worktree  # git worktrees for isolation (default)
          - codex     # Codex sandboxes for isolation
        default: worktree
      max_concurrent:
        type: integer
        default: 4
        minimum: 1
        description: "Maximum experiments to run in parallel. Capped at 6 for worktree backend. 8+ only valid for Codex backend."
      codex_security:
        type: enum
        values:
          - full-auto  # --full-auto (workspace write)
          - yolo       # --dangerously-bypass-approvals-and-sandbox
        default: null
        description: "Codex security posture. If null, user is asked once per session."

  parallel:
    type: object
    default: {}
    description: "Parallelism configuration discovered or set during Phase 1"
    optional_children:
      port_strategy:
        type: enum
        values:
          - parameterized  # use env var for port
          - none           # no port parameterization needed
        default: null
        description: "If null, auto-detected during Phase 1 parallelism probe"
      port_env_var:
        type: string
        default: null
        description: "Environment variable name for port parameterization (e.g., EVAL_PORT)"
      port_base:
        type: integer
        default: null
        description: "Base port number. Each experiment gets port_base + experiment_index."
      shared_files:
        type: array
        default: []
        description: "Files that must be copied into each experiment worktree (e.g., SQLite databases)"
        items:
          type: string
      exclusive_resources:
        type: array
        default: []
        description: "Resources requiring exclusive access (e.g., 'gpu'). If non-empty, forces serial mode."
        items:
          type: string

  dependencies:
    type: object
    default: { approved: [] }
    description: "Dependency management for experiments"
    optional_children:
      approved:
        type: array
        default: []
        description: "Pre-approved new dependencies that experiments may add"
        items:
          type: string

  constraints:
    type: array
    default: []
    description: "Free-text constraints that experiment agents must follow"
    items:
      type: string
    example:
      - "Do not change the output format of clusters"
      - "Preserve backward compatibility with existing cluster consumers"

  stopping:
    type: object
    default: { max_iterations: 100, max_hours: 8, plateau_iterations: 10, target_reached: true }
    description: "When the optimization loop should stop. Any criterion can trigger a stop."
    optional_children:
      max_iterations:
        type: integer
        default: 100
        description: "Stop after this many total experiments"
      max_hours:
        type: number
        default: 8
        description: "Stop after this many hours of wall-clock time"
      plateau_iterations:
        type: integer
        default: 10
        description: "Stop if no improvement for this many consecutive experiments"
      target_reached:
        type: boolean
        default: true
        description: "Stop when the primary metric reaches the target value (if set)"

  max_runner_up_merges_per_batch:
    type: integer
    default: 1
    description: "Maximum number of file-disjoint runner-up experiments to attempt merging per batch after keeping the best experiment"
# ============================================================================
# VALIDATION RULES
# ============================================================================

validation_rules:
  - "All required fields must be present"
  - "name must be lowercase kebab-case (`^[a-z0-9]+(?:-[a-z0-9]+)*$`)"
  - "metric.primary.type must be 'hard' or 'judge'"
  - "If metric.primary.type is 'judge', metric.judge must be present with rubric and scoring"
  - "metric.degenerate_gates must have at least one entry"
  - "measurement.command must be a non-empty string"
  - "scope.mutable must have at least one entry"
  - "scope.immutable must have at least one entry"
  - "Gate check operators must be one of: >=, <=, >, <, ==, !="
  - "execution.max_concurrent must be >= 1"
  - "execution.max_concurrent must not exceed 6 when execution.backend is 'worktree'"
  - "If parallel.exclusive_resources is non-empty, execution.mode should be 'serial'"
  - "If metric.judge.singleton_sample > 0, metric.judge.singleton_rubric must be present"
  - "If metric.primary.type is 'judge' and metric.judge.max_total_cost_usd is null, the user should explicitly approve uncapped spend"
  - "stopping must have at least one non-default criterion or use defaults"