---
# Experiment Log Schema
#
# This is the canonical schema for the experiment log file that accumulates
# across an optimization run.
#
# Location: .context/compound-engineering/ce-optimize/<spec-name>/experiment-log.yaml
#
# PERSISTENCE MODEL:
# The experiment log on disk is the SINGLE SOURCE OF TRUTH. The agent's
# in-memory context is expendable and will be compacted during long runs.
#
# Write discipline:
# - Each experiment entry is APPENDED immediately after its measurement
#   completes (SKILL.md step 3.3), before batch evaluation
# - Outcome fields may be updated in-place after batch evaluation (step 3.5)
# - The `best` section is updated after each batch if a new best is found
# - The `hypothesis_backlog` is updated after each batch
# - The agent re-reads this file from disk at every phase boundary
#
# The orchestrator does NOT read the full log each iteration -- it uses a
# rolling window (last 10 experiments) + a strategy digest file for
# hypothesis generation. But the full log exists on disk for resume,
# crash recovery, and post-run analysis.

# ============================================================================
# TOP-LEVEL STRUCTURE
# ============================================================================

structure:

  spec:
    type: string
    required: true
    description: "Name of the optimization spec this log belongs to"

  run_id:
    type: string
    required: true
    description: "Unique identifier for this optimization run (timestamp-based). Distinguishes resumed runs from fresh starts."

  started_at:
    type: string
    format: "ISO 8601 timestamp"
    required: true

  baseline:
    type: object
    required: true
    description: "Metrics measured on the original code before any optimization"
    children:
      timestamp:
        type: string
        format: "ISO 8601 timestamp"
      gates:
        type: object
        description: "Key-value pairs of gate metric names to their baseline values"
      diagnostics:
        type: object
        description: "Key-value pairs of diagnostic metric names to their baseline values"
      judge:
        type: object
        description: "Judge scores on the baseline (only when primary type is 'judge')"
        children:
          # All fields from the scoring config appear here
          # Plus:
          sample_seed:
            type: integer
          judge_cost_usd:
            type: number

  experiments:
    type: array
    required: true
    description: "Ordered list of all experiments, including kept, reverted, errored, and deferred"
    items:
      type: object
      # See EXPERIMENT ENTRY below

  best:
    type: object
    required: true
    description: "Summary of the current best result"
    children:
      iteration:
        type: integer
        description: "Iteration number of the best experiment (use 0 for the baseline snapshot before any experiment is kept)"
      metrics:
        type: object
        description: "All metric values from the current best state (seed with baseline metrics during CP-1)"
      judge:
        type: object
        description: "Judge scores from the best experiment (only when primary type is 'judge')"
      total_judge_cost_usd:
        type: number
        description: "Running total of all judge costs across all experiments"

  hypothesis_backlog:
    type: array
    description: "Remaining hypotheses not yet tested"
    items:
      type: object
      children:
        description:
          type: string
        category:
          type: string
        priority:
          type: string
          enum: [high, medium, low]
        dep_status:
          type: string
          enum: [approved, needs_approval, not_applicable]
        required_deps:
          type: array
          items:
            type: string

# ============================================================================
# EXPERIMENT ENTRY
# ============================================================================

experiment_entry:
  required_children:

    iteration:
      type: integer
      description: "Sequential experiment number (1-indexed, monotonically increasing)"

    batch:
      type: integer
      description: "Batch number this experiment was part of. Multiple experiments in the same batch ran in parallel."

    hypothesis:
      type: string
      description: "Human-readable description of what this experiment tried"

    category:
      type: string
      description: "Category for grouping and diversity selection (e.g., signal-extraction, graph-signals, embedding, algorithm, preprocessing)"

    outcome:
      type: enum
      values:
        - measured                 # measurement finished and metrics were persisted, awaiting batch evaluation
        - kept                     # primary metric improved, gates passed -> merged to optimization branch
        - reverted                 # primary metric did not improve or was worse -> changes discarded
        - degenerate               # degenerate gate failed -> immediately reverted, no judge evaluation
        - error                    # measurement command crashed, timed out, or produced malformed output
        - deferred_needs_approval  # experiment needs an unapproved dependency -> set aside for batch approval
        - timeout                  # measurement command exceeded timeout_seconds
        - runner_up_kept           # file-disjoint runner-up that was cherry-picked and re-measured successfully
        - runner_up_reverted       # file-disjoint runner-up that was cherry-picked but combined measurement was not better
      description: >
        Load-bearing state: the loop branches on this value.
        'measured' is the only non-terminal state and exists so CP-3 can persist
        raw metrics before batch-level comparison decides the final outcome.
        'kept' and 'runner_up_kept' advance the optimization branch.
        'deferred_needs_approval' items are re-presented at wrap-up.
        All other states are terminal for that experiment.

  optional_children:

    changes:
      type: array
      description: "Files modified by this experiment"
      items:
        type: object
        children:
          file:
            type: string
          summary:
            type: string

    gates:
      type: object
      description: "Gate metric values from the measurement command"

    gates_passed:
      type: boolean
      description: "Whether all degenerate gates passed"

    diagnostics:
      type: object
      description: "Diagnostic metric values from the measurement command"

    judge:
      type: object
      description: "Judge evaluation scores (only when primary type is 'judge' and gates passed)"
      children:
        # All fields from scoring.primary and scoring.secondary appear here
        # Plus:
        judge_cost_usd:
          type: number
          description: "Cost of judge calls for this experiment"

    primary_delta:
      type: string
      description: "Change in primary metric from current best (e.g., '+0.7', '-0.3')"

    learnings:
      type: string
      description: "What was learned from this experiment. The agent reads these to avoid re-trying similar approaches and to inform new hypothesis generation."

    commit:
      type: string
      description: "Git commit SHA on the optimization branch (only for 'kept' and 'runner_up_kept' outcomes)"

    deferred_reason:
      type: string
      description: "Why this experiment was deferred (only for 'deferred_needs_approval' outcome)"

    error_message:
      type: string
      description: "Error details (only for 'error' and 'timeout' outcomes)"

    merged_with:
      type: integer
      description: "Iteration number of the experiment this was merged with (only for 'runner_up_kept' and 'runner_up_reverted')"

# ============================================================================
# OUTCOME STATE TRANSITIONS
# ============================================================================
#
# proposed (in hypothesis_backlog)
#   -> selected for batch
#     -> experiment dispatched
#       -> measurement completed
#         -> gates failed -> outcome: degenerate
#         -> measurement error -> outcome: error
#         -> measurement timeout -> outcome: timeout
#         -> gates passed
#           -> persist raw metrics -> outcome: measured
#             -> judge evaluated (if type: judge)
#               -> best in batch, improved -> outcome: kept
#               -> runner-up, file-disjoint -> cherry-pick + re-measure
#                 -> combined better -> outcome: runner_up_kept
#                 -> combined not better -> outcome: runner_up_reverted
#               -> not improved -> outcome: reverted
#         -> needs unapproved dep -> outcome: deferred_needs_approval
#
# Only 'kept' and 'runner_up_kept' produce a commit on the optimization branch.
# Only 'deferred_needs_approval' items are re-presented at wrap-up for approval.

# ============================================================================
# STRATEGY DIGEST (separate file)
# ============================================================================
#
# Written after each batch to:
#   .context/compound-engineering/ce-optimize/<spec-name>/strategy-digest.md
#
# Contains a compressed summary of:
# - What hypothesis categories have been tried
# - Which approaches succeeded (kept) and which failed (reverted)
# - The exploration frontier: what hasn't been tried yet
# - Key learnings that should inform next hypotheses
#
# The orchestrator reads the strategy digest (not the full experiment log)
# when generating new hypotheses between batches.