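# File-viewer header (kept for provenance; not part of the spec itself):
# claude-engineering-plugin/plugins/compound-engineering/skills/ce-optimize/references/example-judge-spec.yaml
# Reported by the viewer as: 79 lines, 1.8 KiB, YAML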

# Minimal first-run template for qualitative metrics.
# Start here when true quality requires semantic judgment, not a proxy metric.
#
# NOTE(review): the nesting below was reconstructed from key grouping after the
# indentation was lost. Confirm it against the spec schema the consumer
# expects, in particular that `degenerate_gates`, `diagnostics`, and `judge`
# belong under `metric`, and that `parallel` is a top-level key rather than a
# child of `execution`.

name: improve-search-relevance
description: Improve semantic relevance of search results without obvious failures

metric:
  # The primary metric is judge-scored; its name matches
  # `judge.scoring.primary` below.
  primary:
    type: judge
    name: mean_score
    direction: maximize

  # `check` values stay quoted: a plain scalar may not begin with `>`, and
  # quoting keeps both comparisons as strings.
  degenerate_gates:
    - name: result_count
      check: ">= 5"
      description: Return enough results to judge quality
    - name: empty_query_failures
      check: "== 0"
      description: Empty or trivial queries must not fail

  diagnostics:
    - name: latency_ms
    - name: recall_at_10

  judge:
    # Literal block scalar (`|`) so the rubric keeps its line breaks; the
    # `- N:` lines are rubric text, not YAML sequence items.
    rubric: |
      Rate each result set from 1-5 for relevance:
      - 5: Results are directly relevant and well ordered
      - 4: Mostly relevant with minor ordering issues
      - 3: Mixed relevance or one obvious miss
      - 2: Weak relevance, several misses, or poor ordering
      - 1: Mostly irrelevant
      Also report: ambiguous (boolean)
    scoring:
      primary: mean_score
      # Presumably derived from the rubric's `ambiguous` flag; verify
      # against the judge implementation.
      secondary:
        - ambiguous_rate
    # NOTE(review): the six keys below are assumed to be judge settings
    # (siblings of `rubric` and `scoring`); confirm.
    model: haiku
    sample_size: 10
    batch_size: 5
    sample_seed: 42
    minimum_improvement: 0.2
    max_total_cost_usd: 5

measurement:
  command: "python eval_search.py"
  timeout_seconds: 300
  working_directory: "tools/eval"

scope:
  mutable:
    - "src/search/"
    - "config/search.yaml"
  # Includes tools/eval/eval_search.py, which appears to be the script that
  # `measurement.command` runs from `measurement.working_directory`.
  immutable:
    - "tools/eval/eval_search.py"
    - "tests/fixtures/"
    - "docs/"

execution:
  mode: serial
  backend: worktree
  max_concurrent: 1

parallel:
  # `none` is a plain string here, not YAML null (`null` / `~`).
  port_strategy: none
  # Explicit empty list; a bare `shared_files:` would load as null.
  shared_files: []

dependencies:
  approved: []

constraints:
  - "Preserve the existing search response shape"
  - "Do not add new dependencies on the first run"

stopping:
  max_iterations: 4
  max_hours: 1
  plateau_iterations: 3
  target_reached: true
  max_runner_up_merges_per_batch: 0