# Minimal first-run template for qualitative metrics.
# Start here when true quality requires semantic judgment, not a proxy metric.

# Spec name and a short statement of the goal.
name: improve-search-relevance
description: Improve semantic relevance of search results without obvious failures

# NOTE(review): the nesting below was reconstructed from a copy that had lost
# its indentation; confirm that `judge` and its tuning keys sit where the
# consuming tool's schema expects them.
metric:
  # Primary metric: the judge-type score named mean_score, to be maximized.
  primary:
    type: judge
    name: mean_score
    direction: maximize
  # Gate checks; each entry pairs a comparison in `check` with a description.
  # The comparisons stay quoted because a leading `>` is a YAML indicator.
  degenerate_gates:
    - name: result_count
      check: ">= 5"
      description: Return enough results to judge quality
    - name: empty_query_failures
      check: "== 0"
      description: Empty or trivial queries must not fail
  # Additional metrics, listed by name only.
  diagnostics:
    - name: latency_ms
    - name: recall_at_10
  judge:
    # Literal block scalar: line breaks inside the rubric are preserved.
    rubric: |
      Rate each result set from 1-5 for relevance:
      - 5: Results are directly relevant and well ordered
      - 4: Mostly relevant with minor ordering issues
      - 3: Mixed relevance or one obvious miss
      - 2: Weak relevance, several misses, or poor ordering
      - 1: Mostly irrelevant
      Also report: ambiguous (boolean)
    # `primary` here matches metric.primary.name above.
    scoring:
      primary: mean_score
      secondary:
        - ambiguous_rate
    model: haiku
    sample_size: 10
    batch_size: 5
    sample_seed: 42
    minimum_improvement: 0.2
    max_total_cost_usd: 5

# Evaluation command, its timeout in seconds, and its working directory.
measurement:
  command: "python eval_search.py"
  timeout_seconds: 300
  working_directory: "tools/eval"

# Paths grouped as mutable and immutable; the immutable list includes the
# eval script and the test fixtures.
scope:
  mutable:
    - "src/search/"
    - "config/search.yaml"
  immutable:
    - "tools/eval/eval_search.py"
    - "tests/fixtures/"
    - "docs/"

# Serial mode on the worktree backend with a concurrency limit of 1.
execution:
  mode: serial
  backend: worktree
  max_concurrent: 1

# `none` is a plain string here (YAML null would be `null` or `~`);
# `shared_files` is an explicit empty list.
parallel:
  port_strategy: none
  shared_files: []

# The approved list is explicitly empty.
dependencies:
  approved: []

# Free-text constraints, one string per entry.
constraints:
  - "Preserve the existing search response shape"
  - "Do not add new dependencies on the first run"

# Stop settings: iteration and hour caps, a plateau length, and a
# target-reached flag.
stopping:
  max_iterations: 4
  max_hours: 1
  plateau_iterations: 3
  target_reached: true

# NOTE(review): kept at top level because a blank line separates it from the
# preceding section in the source copy; confirm against the tool's schema.
max_runner_up_merges_per_batch: 0