# Experiment Log Schema
# This is the canonical schema for the experiment log file that accumulates
# across an optimization run.
#
# Location: .context/compound-engineering/ce-optimize/<spec-name>/experiment-log.yaml
#
# PERSISTENCE MODEL:
# The experiment log on disk is the SINGLE SOURCE OF TRUTH. The agent's
# in-memory context is expendable and will be compacted during long runs.
#
# Write discipline:
# - Each experiment entry is APPENDED immediately after its measurement
# completes (SKILL.md step 3.3), before batch evaluation
# - Outcome fields may be updated in-place after batch evaluation (step 3.5)
# - The `best` section is updated after each batch if a new best is found
# - The `hypothesis_backlog` is updated after each batch
# - The agent re-reads this file from disk at every phase boundary
#
# The orchestrator does NOT read the full log each iteration -- it uses a
# rolling window (last 10 experiments) + a strategy digest file for
# hypothesis generation. But the full log exists on disk for resume,
# crash recovery, and post-run analysis.
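#
# Concretely, one batch's writes look like this (the batch size of 2 is
# illustrative; step numbers refer to SKILL.md):
#
#   append experiment 7 entry (outcome: measured)    <- step 3.3
#   append experiment 8 entry (outcome: measured)    <- step 3.3
#   update both outcomes in place after evaluation   <- step 3.5
#   update `best` if the batch produced a new best
#   update `hypothesis_backlog`
#   re-read the file from disk at the next phase boundary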
# ============================================================================
# TOP-LEVEL STRUCTURE
# ============================================================================
structure:
  spec:
    type: string
    required: true
    description: "Name of the optimization spec this log belongs to"
  run_id:
    type: string
    required: true
    description: "Unique identifier for this optimization run (timestamp-based). Distinguishes resumed runs from fresh starts."
  started_at:
    type: string
    format: "ISO 8601 timestamp"
    required: true
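  # Illustrative values for the identity fields above (the spec name and
  # the exact run_id convention are assumptions, not prescribed here):
  #
  #   spec: "rerank-quality"
  #   run_id: "run-20250115-093000"
  #   started_at: "2025-01-15T09:30:00Z"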
  baseline:
    type: object
    required: true
    description: "Metrics measured on the original code before any optimization"
    children:
      timestamp:
        type: string
        format: "ISO 8601 timestamp"
      gates:
        type: object
        description: "Key-value pairs of gate metric names to their baseline values"
      diagnostics:
        type: object
        description: "Key-value pairs of diagnostic metric names to their baseline values"
      judge:
        type: object
        description: "Judge scores on the baseline (only when primary type is 'judge')"
        children:
          # All fields from the scoring config appear here
          # Plus:
          sample_seed:
            type: integer
          judge_cost_usd:
            type: number
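  # Illustrative baseline snapshot (metric names such as `compile_ok`,
  # `latency_ms`, and `relevance` are hypothetical; real names come from
  # the spec's measurement command and scoring config):
  #
  #   baseline:
  #     timestamp: "2025-01-15T09:31:12Z"
  #     gates:
  #       compile_ok: 1
  #     diagnostics:
  #       latency_ms: 412
  #     judge:
  #       relevance: 6.2
  #       sample_seed: 1337
  #       judge_cost_usd: 0.14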
  experiments:
    type: array
    required: true
    description: "Ordered list of all experiments, including kept, reverted, errored, and deferred"
    items:
      type: object
      # See EXPERIMENT ENTRY below
  best:
    type: object
    required: true
    description: "Summary of the current best result"
    children:
      iteration:
        type: integer
        description: "Iteration number of the best experiment (use 0 for the baseline snapshot before any experiment is kept)"
      metrics:
        type: object
        description: "All metric values from the current best state (seed with baseline metrics during CP-1)"
      judge:
        type: object
        description: "Judge scores from the best experiment (only when primary type is 'judge')"
      total_judge_cost_usd:
        type: number
        description: "Running total of all judge costs across all experiments"
  hypothesis_backlog:
    type: array
    description: "Remaining hypotheses not yet tested"
    items:
      type: object
      children:
        description:
          type: string
        category:
          type: string
        priority:
          type: string
          enum: [high, medium, low]
        dep_status:
          type: string
          enum: [approved, needs_approval, not_applicable]
        required_deps:
          type: array
          items:
            type: string
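  # Illustrative backlog entry (the hypothesis text and dependency name
  # are made up for the example):
  #
  #   hypothesis_backlog:
  #     - description: "Cache tokenized inputs between passes"
  #       category: preprocessing
  #       priority: high
  #       dep_status: needs_approval
  #       required_deps: ["diskcache"]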
# ============================================================================
# EXPERIMENT ENTRY
# ============================================================================
experiment_entry:
  required_children:
    iteration:
      type: integer
      description: "Sequential experiment number (1-indexed, monotonically increasing)"
    batch:
      type: integer
      description: "Batch number this experiment was part of. Multiple experiments in the same batch ran in parallel."
    hypothesis:
      type: string
      description: "Human-readable description of what this experiment tried"
    category:
      type: string
      description: "Category for grouping and diversity selection (e.g., signal-extraction, graph-signals, embedding, algorithm, preprocessing)"
    outcome:
      type: enum
      values:
        - measured                 # measurement finished and metrics were persisted, awaiting batch evaluation
        - kept                     # primary metric improved, gates passed -> merged to optimization branch
        - reverted                 # primary metric did not improve or was worse -> changes discarded
        - degenerate               # degenerate gate failed -> immediately reverted, no judge evaluation
        - error                    # measurement command crashed or produced malformed output
        - deferred_needs_approval  # experiment needs an unapproved dependency -> set aside for batch approval
        - timeout                  # measurement command exceeded timeout_seconds
        - runner_up_kept           # file-disjoint runner-up that was cherry-picked and re-measured successfully
        - runner_up_reverted       # file-disjoint runner-up that was cherry-picked but combined measurement was not better
      description: >
        Load-bearing state: the loop branches on this value.
        'measured' is the only non-terminal state and exists so CP-3 can persist
        raw metrics before batch-level comparison decides the final outcome.
        'kept' and 'runner_up_kept' advance the optimization branch.
        'deferred_needs_approval' items are re-presented at wrap-up.
        All other states are terminal for that experiment.
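    # Concretely (values are illustrative; the extra fields come from
    # optional_children below): at CP-3 an entry is appended with
    #
    #   outcome: measured
    #
    # and after batch evaluation the same entry is updated in place to
    #
    #   outcome: kept
    #   primary_delta: "+0.7"
    #   commit: "a1b2c3d"   # hypothetical SHA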
  optional_children:
    changes:
      type: array
      description: "Files modified by this experiment"
      items:
        type: object
        children:
          file:
            type: string
          summary:
            type: string
    gates:
      type: object
      description: "Gate metric values from the measurement command"
    gates_passed:
      type: boolean
      description: "Whether all degenerate gates passed"
    diagnostics:
      type: object
      description: "Diagnostic metric values from the measurement command"
    judge:
      type: object
      description: "Judge evaluation scores (only when primary type is 'judge' and gates passed)"
      children:
        # All fields from scoring.primary and scoring.secondary appear here
        # Plus:
        judge_cost_usd:
          type: number
          description: "Cost of judge calls for this experiment"
    primary_delta:
      type: string
      description: "Change in primary metric from current best (e.g., '+0.7', '-0.3')"
    learnings:
      type: string
      description: "What was learned from this experiment. The agent reads these to avoid re-trying similar approaches and to inform new hypothesis generation."
    commit:
      type: string
      description: "Git commit SHA on the optimization branch (only for 'kept' and 'runner_up_kept' outcomes)"
    deferred_reason:
      type: string
      description: "Why this experiment was deferred (only for 'deferred_needs_approval' outcome)"
    error_message:
      type: string
      description: "Error details (only for 'error' and 'timeout' outcomes)"
    merged_with:
      type: integer
      description: "Iteration number of the experiment this was merged with (only for 'runner_up_kept' and 'runner_up_reverted')"
# ============================================================================
# OUTCOME STATE TRANSITIONS
# ============================================================================
#
# proposed (in hypothesis_backlog)
#   -> selected for batch
#     -> experiment dispatched
#       -> measurement completed
#         -> gates failed          -> outcome: degenerate
#         -> measurement error     -> outcome: error
#         -> measurement timeout   -> outcome: timeout
#         -> gates passed
#           -> persist raw metrics -> outcome: measured
#           -> judge evaluated (if type: judge)
#             -> best in batch, improved  -> outcome: kept
#             -> runner-up, file-disjoint -> cherry-pick + re-measure
#               -> combined better        -> outcome: runner_up_kept
#               -> combined not better    -> outcome: runner_up_reverted
#             -> not improved             -> outcome: reverted
#       -> needs unapproved dep    -> outcome: deferred_needs_approval
#
# Only 'kept' and 'runner_up_kept' produce a commit on the optimization branch.
# Only 'deferred_needs_approval' items are re-presented at wrap-up for approval.
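#
# Illustrative runner-up pair (iteration numbers and SHAs are made up):
# experiment 8 lost its batch to experiment 7 but touched disjoint files,
# so it was cherry-picked on top of 7's commit and re-measured. Other
# fields are elided:
#
#   - iteration: 7
#     outcome: kept
#     commit: "a1b2c3d"
#   - iteration: 8
#     outcome: runner_up_kept
#     merged_with: 7
#     commit: "e4f5a6b"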
# ============================================================================
# STRATEGY DIGEST (separate file)
# ============================================================================
#
# Written after each batch to:
# .context/compound-engineering/ce-optimize/<spec-name>/strategy-digest.md
#
# Contains a compressed summary of:
# - What hypothesis categories have been tried
# - Which approaches succeeded (kept) and which failed (reverted)
# - The exploration frontier: what hasn't been tried yet
# - Key learnings that should inform next hypotheses
#
# The orchestrator reads the strategy digest (not the full experiment log)
# when generating new hypotheses between batches.
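#
# A sketch of what the digest might contain (its format is free-form
# markdown; this outline is an assumption, not part of the schema):
#
#   ## Tried
#   - preprocessing: 3 experiments, 1 kept (caching)
#   ## Frontier
#   - graph-signals: not yet attempted
#   ## Key learnings
#   - Latency wins so far came from avoiding repeated work, not from
#     algorithmic changes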