feat(ce-optimize): Auto-research loop for tuning system prompts / vector clustering / evaluating different code solution / etc (#446)
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,257 @@
# Experiment Log Schema
#
# This is the canonical schema for the experiment log file that accumulates
# across an optimization run.
#
# Location: .context/compound-engineering/ce-optimize/<spec-name>/experiment-log.yaml
#
# PERSISTENCE MODEL:
# The experiment log on disk is the SINGLE SOURCE OF TRUTH. The agent's
# in-memory context is expendable and will be compacted during long runs.
#
# Write discipline:
# - Each experiment entry is APPENDED immediately after its measurement
#   completes (SKILL.md step 3.3), before batch evaluation
# - Outcome fields may be updated in-place after batch evaluation (step 3.5)
# - The `best` section is updated after each batch if a new best is found
# - The `hypothesis_backlog` is updated after each batch
# - The agent re-reads this file from disk at every phase boundary
#
# The orchestrator does NOT read the full log each iteration -- it uses a
# rolling window (last 10 experiments) + a strategy digest file for
# hypothesis generation. But the full log exists on disk for resume,
# crash recovery, and post-run analysis.

# ============================================================================
# TOP-LEVEL STRUCTURE
# ============================================================================
# Top-level keys of experiment-log.yaml. Each entry documents one key:
# its type, whether it is required, and what it holds.
structure:

  spec:
    type: string
    required: true
    description: "Name of the optimization spec this log belongs to"

  run_id:
    type: string
    required: true
    description: "Unique identifier for this optimization run (timestamp-based). Distinguishes resumed runs from fresh starts."

  started_at:
    type: string
    format: "ISO 8601 timestamp"
    required: true

  baseline:
    type: object
    required: true
    description: "Metrics measured on the original code before any optimization"
    children:
      timestamp:
        type: string
        format: "ISO 8601 timestamp"
      gates:
        type: object
        description: "Key-value pairs of gate metric names to their baseline values"
      diagnostics:
        type: object
        description: "Key-value pairs of diagnostic metric names to their baseline values"
      judge:
        type: object
        description: "Judge scores on the baseline (only when primary type is 'judge')"
        children:
          # All fields from the scoring config appear here
          # Plus:
          sample_seed:
            type: integer
          judge_cost_usd:
            type: number

  experiments:
    type: array
    required: true
    description: "Ordered list of all experiments, including kept, reverted, errored, and deferred"
    items:
      type: object
      # See EXPERIMENT ENTRY below

  best:
    type: object
    required: true
    description: "Summary of the current best result"
    children:
      iteration:
        type: integer
        description: "Iteration number of the best experiment (use 0 for the baseline snapshot before any experiment is kept)"
      metrics:
        type: object
        description: "All metric values from the current best state (seed with baseline metrics during CP-1)"
      judge:
        type: object
        description: "Judge scores from the best experiment (only when primary type is 'judge')"
      total_judge_cost_usd:
        type: number
        description: "Running total of all judge costs across all experiments"

  hypothesis_backlog:
    type: array
    description: "Remaining hypotheses not yet tested"
    items:
      type: object
      children:
        description:
          type: string
        category:
          type: string
        priority:
          type: string
          enum: [high, medium, low]
        dep_status:
          type: string
          enum: [approved, needs_approval, not_applicable]
        required_deps:
          type: array
          items:
            type: string

# ============================================================================
# EXPERIMENT ENTRY
# ============================================================================
# Schema for a single element of the top-level `experiments` array.
# Split into the fields every entry must carry and the fields whose
# presence depends on the entry's `outcome`.
experiment_entry:
  required_children:

    iteration:
      type: integer
      description: "Sequential experiment number (1-indexed, monotonically increasing)"

    batch:
      type: integer
      description: "Batch number this experiment was part of. Multiple experiments in the same batch ran in parallel."

    hypothesis:
      type: string
      description: "Human-readable description of what this experiment tried"

    category:
      type: string
      description: "Category for grouping and diversity selection (e.g., signal-extraction, graph-signals, embedding, algorithm, preprocessing)"

    outcome:
      type: enum
      values:
        - measured  # measurement finished and metrics were persisted, awaiting batch evaluation
        - kept  # primary metric improved, gates passed -> merged to optimization branch
        - reverted  # primary metric did not improve or was worse -> changes discarded
        - degenerate  # degenerate gate failed -> immediately reverted, no judge evaluation
        - error  # measurement command crashed, timed out, or produced malformed output
        - deferred_needs_approval  # experiment needs an unapproved dependency -> set aside for batch approval
        - timeout  # measurement command exceeded timeout_seconds
        - runner_up_kept  # file-disjoint runner-up that was cherry-picked and re-measured successfully
        - runner_up_reverted  # file-disjoint runner-up that was cherry-picked but combined measurement was not better
      description: >
        Load-bearing state: the loop branches on this value.
        'measured' is the only non-terminal state and exists so CP-3 can persist
        raw metrics before batch-level comparison decides the final outcome.
        'kept' and 'runner_up_kept' advance the optimization branch.
        'deferred_needs_approval' items are re-presented at wrap-up.
        All other states are terminal for that experiment.

  optional_children:

    changes:
      type: array
      description: "Files modified by this experiment"
      items:
        type: object
        children:
          file:
            type: string
          summary:
            type: string

    gates:
      type: object
      description: "Gate metric values from the measurement command"

    gates_passed:
      type: boolean
      description: "Whether all degenerate gates passed"

    diagnostics:
      type: object
      description: "Diagnostic metric values from the measurement command"

    judge:
      type: object
      description: "Judge evaluation scores (only when primary type is 'judge' and gates passed)"
      children:
        # All fields from scoring.primary and scoring.secondary appear here
        # Plus:
        judge_cost_usd:
          type: number
          description: "Cost of judge calls for this experiment"

    primary_delta:
      type: string
      description: "Change in primary metric from current best (e.g., '+0.7', '-0.3')"

    learnings:
      type: string
      description: "What was learned from this experiment. The agent reads these to avoid re-trying similar approaches and to inform new hypothesis generation."

    commit:
      type: string
      description: "Git commit SHA on the optimization branch (only for 'kept' and 'runner_up_kept' outcomes)"

    deferred_reason:
      type: string
      description: "Why this experiment was deferred (only for 'deferred_needs_approval' outcome)"

    error_message:
      type: string
      description: "Error details (only for 'error' and 'timeout' outcomes)"

    merged_with:
      type: integer
      description: "Iteration number of the experiment this was merged with (only for 'runner_up_kept' and 'runner_up_reverted')"
# ============================================================================
# OUTCOME STATE TRANSITIONS
# ============================================================================
#
# proposed (in hypothesis_backlog)
#   -> selected for batch
#   -> experiment dispatched
#   -> measurement completed
#      -> gates failed        -> outcome: degenerate
#      -> measurement error   -> outcome: error
#      -> measurement timeout -> outcome: timeout
#      -> gates passed
#         -> persist raw metrics -> outcome: measured
#         -> judge evaluated (if type: judge)
#            -> best in batch, improved -> outcome: kept
#            -> runner-up, file-disjoint -> cherry-pick + re-measure
#               -> combined better     -> outcome: runner_up_kept
#               -> combined not better -> outcome: runner_up_reverted
#            -> not improved -> outcome: reverted
#   -> needs unapproved dep -> outcome: deferred_needs_approval
#
# Only 'kept' and 'runner_up_kept' produce a commit on the optimization branch.
# Only 'deferred_needs_approval' items are re-presented at wrap-up for approval.

# ============================================================================
# STRATEGY DIGEST (separate file)
# ============================================================================
#
# Written after each batch to:
#   .context/compound-engineering/ce-optimize/<spec-name>/strategy-digest.md
#
# Contains a compressed summary of:
# - What hypothesis categories have been tried
# - Which approaches succeeded (kept) and which failed (reverted)
# - The exploration frontier: what hasn't been tried yet
# - Key learnings that should inform next hypotheses
#
# The orchestrator reads the strategy digest (not the full experiment log)
# when generating new hypotheses between batches.