refactor(ce-doc-review): anchor-based confidence scoring (#622)
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -56,7 +56,7 @@ Seed map (run this plan through ce-doc-review to verify):
|
||||
- PII handling during migration window unstated — compliance
|
||||
gap independent of premise
|
||||
|
||||
- FYI candidates (4, confidence 0.40-0.65 at P3):
|
||||
- FYI candidates (4, anchor 50 at P3):
|
||||
- naming preference ("AuthContext" vs "SessionContext" — both
|
||||
legible in the code)
|
||||
- speculative future-work concern (could reuse this for a
|
||||
@@ -65,7 +65,7 @@ Seed map (run this plan through ce-doc-review to verify):
|
||||
- unit-organization preference (could group by route rather
|
||||
than by endpoint class — current split also reads fine)
|
||||
|
||||
- drop-worthy P3s (3, confidence 0.55-0.74):
|
||||
- drop-worthy P3s (3, anchors 0/25):
|
||||
- vague performance concern without baseline ("could be slow
|
||||
under load")
|
||||
- theoretical multi-region concern not relevant to single-region
|
||||
|
||||
8
tests/fixtures/ce-doc-review/seeded-plan.md
vendored
8
tests/fixtures/ce-doc-review/seeded-plan.md
vendored
@@ -26,14 +26,14 @@ Seed map (run this plan through ce-doc-review to verify):
|
||||
scope-guardian complexity challenge (is this abstraction warranted),
|
||||
product-lens trajectory concern (does this paint the system into a
|
||||
corner)
|
||||
- FYI candidates (5, confidence 0.40-0.65 at P3): filename-symmetry
|
||||
- FYI candidates (5, anchor 50 at P3): filename-symmetry
|
||||
observation, drift note, stylistic preference without evidence of
|
||||
impact, speculative future-work concern, subjective readability note
|
||||
- drop-worthy P3s (3, confidence 0.55-0.74): vague style nitpick, low-
|
||||
- drop-worthy P3s (3, anchors 0/25): vague style nitpick, low-
|
||||
signal "consider X" residual, theoretical scalability concern without
|
||||
current evidence
|
||||
|
||||
The descriptions intentionally vary in evidence quality so the confidence
|
||||
The descriptions intentionally vary in evidence quality so the anchor
|
||||
gate is exercised.
|
||||
-->
|
||||
|
||||
@@ -205,7 +205,7 @@ one-command rename. (Seeded manual: scope-guardian complexity challenge
|
||||
|
||||
- The plan's section ordering could be improved; "Miscellaneous Notes"
|
||||
feels like a catch-all. (Seeded drop: vague style nitpick at P3,
|
||||
confidence should register below 0.75 gate.)
|
||||
should register at anchor 0 or 25 and drop silently.)
|
||||
- Consider whether the schema migration strategy scales if the codebase
|
||||
grows 10x. (Seeded drop: theoretical scalability concern without
|
||||
current evidence, P3.)
|
||||
|
||||
@@ -372,6 +372,51 @@ describe("ce-doc-review contract", () => {
|
||||
expect(enumValues).not.toContain("present")
|
||||
})
|
||||
|
||||
test("findings schema enforces discrete confidence anchors", async () => {
  // Read the findings schema from the repo and pull out the definition of the
  // per-finding `confidence` property.
  const rawSchema = await readRepoFile(
    "plugins/compound-engineering/skills/ce-doc-review/references/findings-schema.json"
  )
  const { confidence: confidenceSpec } = JSON.parse(rawSchema).properties.findings.items.properties

  // The property must be an integer restricted to the five anchor values,
  // rather than a continuous float.
  expect(confidenceSpec.type).toBe("integer")
  expect(confidenceSpec.enum).toEqual([0, 25, 50, 75, 100])

  // Range keywords left over from a continuous scale must not be present.
  for (const rangeKeyword of ["minimum", "maximum"]) {
    expect(confidenceSpec[rangeKeyword]).toBeUndefined()
  }

  // Rubric wording that has to appear in the property description (the
  // original note says this is so persona agents see it).
  const rubricPhrases = [
    "Absolutely certain",
    "Highly confident",
    "Moderately confident",
    "double-checked",
    "evidence directly confirms",
  ]
  for (const phrase of rubricPhrases) {
    expect(confidenceSpec.description).toContain(phrase)
  }
})
|
||||
|
||||
test("subagent template embeds anchor rubric and bans float confidence", async () => {
  // Full text of the persona-facing subagent template.
  const templateText = await readRepoFile(
    "plugins/compound-engineering/skills/ce-doc-review/references/subagent-template.md"
  )

  // The rubric heading and each of the five anchors (in backticks) must be
  // present in the template.
  const rubricMarkers = ["Confidence rubric", "`0`", "`25`", "`50`", "`75`", "`100`"]
  for (const marker of rubricMarkers) {
    expect(templateText).toContain(marker)
  }

  // The example finding carries an integer anchor; a fractional value such as
  // `"confidence": 0.85` must not occur anywhere in the template.
  const floatConfidencePattern = /"confidence":\s*0\.\d+/
  expect(templateText).toContain('"confidence": 100')
  expect(templateText).not.toMatch(floatConfidencePattern)

  // Advisory observations are expressed as anchor 50; the old band wording is
  // rejected with either an en dash or a plain hyphen.
  expect(templateText).toContain("`confidence: 50`")
  const retiredBandLabels = ["0.40–0.59 LOW/Advisory band", "0.40-0.59 LOW/Advisory band"]
  for (const label of retiredBandLabels) {
    expect(templateText).not.toContain(label)
  }
})
|
||||
|
||||
test("subagent template carries framing guidance and strawman rule", async () => {
|
||||
const template = await readRepoFile(
|
||||
"plugins/compound-engineering/skills/ce-doc-review/references/subagent-template.md"
|
||||
@@ -397,30 +442,30 @@ describe("ce-doc-review contract", () => {
|
||||
expect(template).toContain("<decision-primer-rules>")
|
||||
})
|
||||
|
||||
test("synthesis pipeline routes three tiers with per-severity gates and FYI subsection", async () => {
|
||||
test("synthesis pipeline routes three tiers with anchor-based gating and FYI subsection", async () => {
|
||||
const synthesis = await readRepoFile(
|
||||
"plugins/compound-engineering/skills/ce-doc-review/references/synthesis-and-presentation.md"
|
||||
)
|
||||
|
||||
// Per-severity confidence gate with the specific thresholds
|
||||
expect(synthesis).toContain("Per-Severity")
|
||||
expect(synthesis).toMatch(/P0\s*\|\s*0\.50/)
|
||||
expect(synthesis).toMatch(/P1\s*\|\s*0\.60/)
|
||||
expect(synthesis).toMatch(/P2\s*\|\s*0\.65/)
|
||||
expect(synthesis).toMatch(/P3\s*\|\s*0\.75/)
|
||||
// Anchor-based confidence gate
|
||||
expect(synthesis).toContain("Anchor-Based")
|
||||
expect(synthesis).toMatch(/`0`\s*\|/)
|
||||
expect(synthesis).toMatch(/`25`\s*\|/)
|
||||
expect(synthesis).toMatch(/`50`\s*\|/)
|
||||
expect(synthesis).toMatch(/`75`\s*\|/)
|
||||
expect(synthesis).toMatch(/`100`\s*\|/)
|
||||
|
||||
// FYI floor at 0.40 for low-confidence manual findings
|
||||
expect(synthesis).toContain("0.40")
|
||||
expect(synthesis).toContain("FYI floor")
|
||||
// Anchor 50 routes to FYI, anchors 75/100 enter actionable tier
|
||||
expect(synthesis).toContain("FYI subsection")
|
||||
|
||||
// Three-tier routing table present
|
||||
// Three-tier routing table present (autofix_class)
|
||||
expect(synthesis).toContain("`safe_auto`")
|
||||
expect(synthesis).toContain("`gated_auto`")
|
||||
expect(synthesis).toContain("`manual`")
|
||||
|
||||
// Cross-persona agreement boost (replaces residual-concern promotion)
|
||||
expect(synthesis).toContain("Cross-Persona Agreement Boost")
|
||||
expect(synthesis).toContain("+0.10")
|
||||
// Cross-persona agreement promotion (replaces +0.10 boost)
|
||||
expect(synthesis).toContain("Cross-Persona Agreement Promotion")
|
||||
expect(synthesis).toContain("one anchor step")
|
||||
|
||||
// R29 and R30 round-2 rules
|
||||
expect(synthesis).toContain("R29 Rejected-Finding Suppression")
|
||||
|
||||
Reference in New Issue
Block a user