feat(ce-optimize): Auto-research loop for tuning system prompts / vector clustering / evaluating different code solutions / etc. (#446)
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,392 @@
|
||||
# Optimization Spec Schema
|
||||
# This is the canonical schema for optimization spec files created by users
|
||||
# to configure a /ce-optimize run. The orchestrating agent validates specs
|
||||
# against this schema before proceeding.
|
||||
#
|
||||
# Usage: Create a YAML file matching this schema and pass it to /ce-optimize.
|
||||
# The agent reads this spec, validates required fields, and uses it to
|
||||
# configure the entire optimization run.
|
||||
|
||||
# ============================================================================
|
||||
# REQUIRED FIELDS
|
||||
# ============================================================================
|
||||
|
||||
required_fields:
|
||||
|
||||
name:
|
||||
type: string
|
||||
pattern: "^[a-z0-9]+(?:-[a-z0-9]+)*$"
|
||||
description: "Unique identifier for this optimization run (lowercase kebab-case, safe for git refs and worktree paths)"
|
||||
example: "improve-issue-clustering"
|
||||
|
||||
description:
|
||||
type: string
|
||||
description: "Human-readable description of the optimization goal"
|
||||
example: "Improve coherence and coverage of issue/PR clusters"
|
||||
|
||||
metric:
|
||||
type: object
|
||||
description: "Three-tier metric configuration"
|
||||
required_children:
|
||||
|
||||
primary:
|
||||
type: object
|
||||
description: "The metric the loop optimizes against"
|
||||
required_children:
|
||||
|
||||
type:
|
||||
type: enum
|
||||
values:
|
||||
- hard # scalar metric from measurement command (e.g., build time, test pass rate)
|
||||
- judge # LLM-as-judge quality score from sampled outputs
|
||||
description: "Whether the primary metric comes from the measurement command directly or from LLM-as-judge evaluation"
|
||||
|
||||
name:
|
||||
type: string
|
||||
description: "Metric name — must match a key in the measurement command's JSON output (for hard type) or a scoring field (for judge type)"
|
||||
example: "cluster_coherence"
|
||||
|
||||
direction:
|
||||
type: enum
|
||||
values:
|
||||
- maximize
|
||||
- minimize
|
||||
description: "Whether higher or lower is better"
|
||||
|
||||
optional_children:
|
||||
|
||||
baseline:
|
||||
type: number
|
||||
default: null
|
||||
description: "Filled automatically during Phase 1 baseline measurement. Do not set manually."
|
||||
|
||||
target:
|
||||
type: number
|
||||
default: null
|
||||
description: "Optional target value. The loop stops when the primary metric reaches it, unless stopping.target_reached is set to false."
|
||||
example: 4.2
|
||||
|
||||
degenerate_gates:
|
||||
type: array
|
||||
description: "Fast boolean checks that reject obviously broken solutions before expensive evaluation. Run first, before the primary metric or judge."
|
||||
required: true
|
||||
items:
|
||||
type: object
|
||||
required_children:
|
||||
name:
|
||||
type: string
|
||||
description: "Metric name — must match a key in the measurement command's JSON output"
|
||||
check:
|
||||
type: string
|
||||
description: "Comparison operator and threshold. Supported operators: >=, <=, >, <, ==, !="
|
||||
example: "<= 0.10"
|
||||
optional_children:
|
||||
description:
|
||||
type: string
|
||||
description: "Human-readable explanation of what this gate catches"
|
||||
|
||||
optional_children:
|
||||
|
||||
diagnostics:
|
||||
type: array
|
||||
default: []
|
||||
description: "Metrics logged for understanding but never gated on. Useful for understanding WHY a primary metric changed."
|
||||
items:
|
||||
type: object
|
||||
required_children:
|
||||
name:
|
||||
type: string
|
||||
description: "Metric name — must match a key in the measurement command's JSON output"
|
||||
|
||||
judge:
|
||||
type: object
|
||||
description: "LLM-as-judge configuration. Required when metric.primary.type is 'judge'. Ignored when type is 'hard'."
|
||||
required_when: "metric.primary.type == 'judge'"
|
||||
required_children:
|
||||
rubric:
|
||||
type: string
|
||||
description: "Multi-line rubric text sent to the judge model. Must instruct the judge to return JSON."
|
||||
example: |
|
||||
Rate this cluster 1-5:
|
||||
- 5: All items clearly about the same issue/feature
|
||||
- 4: Strong theme, minor outliers
|
||||
- 3: Related but covers 2-3 sub-topics
|
||||
- 2: Weak connection
|
||||
- 1: Unrelated items grouped together
|
||||
scoring:
|
||||
type: object
|
||||
required_children:
|
||||
primary:
|
||||
type: string
|
||||
description: "Field name from judge JSON output to use as the primary optimization target"
|
||||
example: "mean_score"
|
||||
optional_children:
|
||||
secondary:
|
||||
type: array
|
||||
default: []
|
||||
description: "Additional scoring fields to log (not optimized against)"
|
||||
optional_children:
|
||||
model:
|
||||
type: enum
|
||||
values:
|
||||
- haiku
|
||||
- sonnet
|
||||
default: haiku
|
||||
description: "Model to use for judge evaluation. Haiku is cheaper and faster; Sonnet is more nuanced."
|
||||
sample_size:
|
||||
type: integer
|
||||
default: 10
|
||||
description: "Total number of output items to sample for judge evaluation per experiment"
|
||||
stratification:
|
||||
type: array
|
||||
default: null
|
||||
description: "Stratified sampling buckets. If null, uses uniform random sampling."
|
||||
items:
|
||||
type: object
|
||||
required_children:
|
||||
bucket:
|
||||
type: string
|
||||
description: "Bucket name for this stratum"
|
||||
count:
|
||||
type: integer
|
||||
description: "Number of items to sample from this bucket"
|
||||
singleton_sample:
|
||||
type: integer
|
||||
default: 0
|
||||
description: "Number of singleton items to sample for false-negative evaluation"
|
||||
singleton_rubric:
|
||||
type: string
|
||||
default: null
|
||||
description: "Rubric for evaluating sampled singletons. Required if singleton_sample > 0."
|
||||
sample_seed:
|
||||
type: integer
|
||||
default: 42
|
||||
description: "Fixed seed for reproducible sampling across experiments"
|
||||
batch_size:
|
||||
type: integer
|
||||
default: 5
|
||||
description: "Number of samples per judge sub-agent batch. Controls parallelism vs overhead."
|
||||
minimum_improvement:
|
||||
type: number
|
||||
default: 0.3
|
||||
description: "Minimum judge score improvement required to accept an experiment as 'better'. Accounts for sample-composition variance when output structure changes between experiments. Distinct from measurement.stability.noise_threshold which handles run-to-run flakiness."
|
||||
max_total_cost_usd:
|
||||
type: number
|
||||
default: 5
|
||||
description: "Stop judge evaluation when cumulative judge spend reaches this cap. This is a first-run safety default; raise it only after the rubric and harness are trustworthy. Set to null only with explicit user approval."
|
||||
|
||||
measurement:
|
||||
type: object
|
||||
description: "How to run the measurement harness"
|
||||
required_children:
|
||||
command:
|
||||
type: string
|
||||
description: "Shell command that runs the evaluation and outputs JSON to stdout. The JSON must contain keys matching all gate names and diagnostic names, plus the primary metric name when metric.primary.type is 'hard'."
|
||||
example: "python evaluate.py"
|
||||
optional_children:
|
||||
timeout_seconds:
|
||||
type: integer
|
||||
default: 600
|
||||
description: "Maximum seconds for the measurement command to run before being killed"
|
||||
output_format:
|
||||
type: enum
|
||||
values:
|
||||
- json
|
||||
default: json
|
||||
description: "Format of the measurement command's stdout. Currently only JSON is supported."
|
||||
working_directory:
|
||||
type: string
|
||||
default: "."
|
||||
description: "Working directory for the measurement command, relative to the repo root"
|
||||
stability:
|
||||
type: object
|
||||
default: { mode: "stable" }
|
||||
description: "How to handle metric variance across runs"
|
||||
required_children:
|
||||
mode:
|
||||
type: enum
|
||||
values:
|
||||
- stable # run once, trust the result
|
||||
- repeat # run N times, aggregate
|
||||
default: stable
|
||||
optional_children:
|
||||
repeat_count:
|
||||
type: integer
|
||||
default: 5
|
||||
description: "Number of times to run the harness when mode is 'repeat'"
|
||||
aggregation:
|
||||
type: enum
|
||||
values:
|
||||
- median
|
||||
- mean
|
||||
- min
|
||||
- max
|
||||
default: median
|
||||
description: "How to combine repeated measurements into a single value"
|
||||
noise_threshold:
|
||||
type: number
|
||||
default: 0.02
|
||||
description: "An improvement must exceed this value to count as real rather than noise. Applied to hard metrics only."
|
||||
|
||||
scope:
|
||||
type: object
|
||||
description: "What the experiment agent is allowed to modify"
|
||||
required_children:
|
||||
mutable:
|
||||
type: array
|
||||
description: "Files and directories the agent MAY modify during experiments"
|
||||
items:
|
||||
type: string
|
||||
description: "File path or directory (relative to repo root). Directories match all files within."
|
||||
example:
|
||||
- "src/clustering/"
|
||||
- "src/preprocessing/"
|
||||
- "config/clustering.yaml"
|
||||
immutable:
|
||||
type: array
|
||||
description: "Files and directories the agent MUST NOT modify. The measurement harness should always be listed here."
|
||||
items:
|
||||
type: string
|
||||
example:
|
||||
- "evaluate.py"
|
||||
- "tests/fixtures/"
|
||||
- "data/"
|
||||
|
||||
# ============================================================================
|
||||
# OPTIONAL FIELDS
|
||||
# ============================================================================
|
||||
|
||||
optional_fields:
|
||||
|
||||
execution:
|
||||
type: object
|
||||
default: { mode: "parallel", backend: "worktree", max_concurrent: 4 }
|
||||
description: "How experiments are executed"
|
||||
optional_children:
|
||||
mode:
|
||||
type: enum
|
||||
values:
|
||||
- parallel # run experiments simultaneously (default)
|
||||
- serial # run one at a time
|
||||
default: parallel
|
||||
backend:
|
||||
type: enum
|
||||
values:
|
||||
- worktree # git worktrees for isolation (default)
|
||||
- codex # Codex sandboxes for isolation
|
||||
default: worktree
|
||||
max_concurrent:
|
||||
type: integer
|
||||
default: 4
|
||||
minimum: 1
|
||||
description: "Maximum experiments to run in parallel. Capped at 6 for the worktree backend; values above 6 are only valid for the Codex backend."
|
||||
codex_security:
|
||||
type: enum
|
||||
values:
|
||||
- full-auto # --full-auto (workspace write)
|
||||
- yolo # --dangerously-bypass-approvals-and-sandbox
|
||||
default: null
|
||||
description: "Codex security posture. If null, user is asked once per session."
|
||||
|
||||
parallel:
|
||||
type: object
|
||||
default: {}
|
||||
description: "Parallelism configuration discovered or set during Phase 1"
|
||||
optional_children:
|
||||
port_strategy:
|
||||
type: enum
|
||||
values:
|
||||
- parameterized # use env var for port
|
||||
- none # no port parameterization needed
|
||||
default: null
|
||||
description: "If null, auto-detected during Phase 1 parallelism probe"
|
||||
port_env_var:
|
||||
type: string
|
||||
default: null
|
||||
description: "Environment variable name for port parameterization (e.g., EVAL_PORT)"
|
||||
port_base:
|
||||
type: integer
|
||||
default: null
|
||||
description: "Base port number. Each experiment gets port_base + experiment_index."
|
||||
shared_files:
|
||||
type: array
|
||||
default: []
|
||||
description: "Files that must be copied into each experiment worktree (e.g., SQLite databases)"
|
||||
items:
|
||||
type: string
|
||||
exclusive_resources:
|
||||
type: array
|
||||
default: []
|
||||
description: "Resources requiring exclusive access (e.g., 'gpu'). If non-empty, forces serial mode."
|
||||
items:
|
||||
type: string
|
||||
|
||||
dependencies:
|
||||
type: object
|
||||
default: { approved: [] }
|
||||
description: "Dependency management for experiments"
|
||||
optional_children:
|
||||
approved:
|
||||
type: array
|
||||
default: []
|
||||
description: "Pre-approved new dependencies that experiments may add"
|
||||
items:
|
||||
type: string
|
||||
|
||||
constraints:
|
||||
type: array
|
||||
default: []
|
||||
description: "Free-text constraints that experiment agents must follow"
|
||||
items:
|
||||
type: string
|
||||
example:
|
||||
- "Do not change the output format of clusters"
|
||||
- "Preserve backward compatibility with existing cluster consumers"
|
||||
|
||||
stopping:
|
||||
type: object
|
||||
default: { max_iterations: 100, max_hours: 8, plateau_iterations: 10, target_reached: true }
|
||||
description: "When the optimization loop should stop. Any criterion can trigger a stop."
|
||||
optional_children:
|
||||
max_iterations:
|
||||
type: integer
|
||||
default: 100
|
||||
description: "Stop after this many total experiments"
|
||||
max_hours:
|
||||
type: number
|
||||
default: 8
|
||||
description: "Stop after this many hours of wall-clock time"
|
||||
plateau_iterations:
|
||||
type: integer
|
||||
default: 10
|
||||
description: "Stop if no improvement for this many consecutive experiments"
|
||||
target_reached:
|
||||
type: boolean
|
||||
default: true
|
||||
description: "Stop when the primary metric reaches the target value (if set)"
|
||||
|
||||
max_runner_up_merges_per_batch:
|
||||
type: integer
|
||||
default: 1
|
||||
description: "Maximum number of file-disjoint runner-up experiments to attempt merging per batch after keeping the best experiment"
|
||||
|
||||
# ============================================================================
|
||||
# VALIDATION RULES
|
||||
# ============================================================================
|
||||
|
||||
validation_rules:
|
||||
- "All required fields must be present"
|
||||
- "name must be lowercase kebab-case (`^[a-z0-9]+(?:-[a-z0-9]+)*$`)"
|
||||
- "metric.primary.type must be 'hard' or 'judge'"
|
||||
- "If metric.primary.type is 'judge', metric.judge must be present with rubric and scoring"
|
||||
- "metric.degenerate_gates must have at least one entry"
|
||||
- "measurement.command must be a non-empty string"
|
||||
- "scope.mutable must have at least one entry"
|
||||
- "scope.immutable must have at least one entry"
|
||||
- "Gate check operators must be one of: >=, <=, >, <, ==, !="
|
||||
- "execution.max_concurrent must be >= 1"
|
||||
- "execution.max_concurrent must not exceed 6 when execution.backend is 'worktree'"
|
||||
- "If parallel.exclusive_resources is non-empty, execution.mode should be 'serial'"
|
||||
- "If metric.judge.singleton_sample > 0, metric.judge.singleton_rubric must be present"
|
||||
- "If metric.primary.type is 'judge' and metric.judge.max_total_cost_usd is null, the user should explicitly approve uncapped spend"
|
||||
- "stopping must have at least one non-default criterion or use defaults"
|
||||
Reference in New Issue
Block a user