refactor(ce-doc-review): anchor-based confidence scoring (#622)

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-21 14:54:03 -07:00
parent bd77d5550a
commit 6caf330363
20 changed files with 756 additions and 122 deletions
--- a/tests/fixtures/ce-doc-review/seeded-auth-plan.md
+++ b/tests/fixtures/ce-doc-review/seeded-auth-plan.md
@@ -56,7 +56,7 @@ Seed map (run this plan through ce-doc-review to verify):
    - PII handling during migration window unstated — compliance
      gap independent of premise

- FYI candidates (4, confidence 0.40-0.65 at P3):
+- FYI candidates (4, anchor 50 at P3):
    - naming preference ("AuthContext" vs "SessionContext" — both
      legible in the code)
    - speculative future-work concern (could reuse this for a
@@ -65,7 +65,7 @@ Seed map (run this plan through ce-doc-review to verify):
    - unit-organization preference (could group by route rather
      than by endpoint class — current split also reads fine)

- drop-worthy P3s (3, confidence 0.55-0.74):
+- drop-worthy P3s (3, anchors 0/25):
    - vague performance concern without baseline ("could be slow
      under load")
    - theoretical multi-region concern not relevant to single-region
--- a/tests/fixtures/ce-doc-review/seeded-plan.md
+++ b/tests/fixtures/ce-doc-review/seeded-plan.md
@@ -26,14 +26,14 @@ Seed map (run this plan through ce-doc-review to verify):
  scope-guardian complexity challenge (is this abstraction warranted),
  product-lens trajectory concern (does this paint the system into a
  corner)
- FYI candidates (5, confidence 0.40-0.65 at P3): filename-symmetry
+- FYI candidates (5, anchor 50 at P3): filename-symmetry
  observation, drift note, stylistic preference without evidence of
  impact, speculative future-work concern, subjective readability note
- drop-worthy P3s (3, confidence 0.55-0.74): vague style nitpick, low-
+- drop-worthy P3s (3, anchors 0/25): vague style nitpick, low-
  signal "consider X" residual, theoretical scalability concern without
  current evidence

-The descriptions intentionally vary in evidence quality so the confidence
+The descriptions intentionally vary in evidence quality so the anchor
 gate is exercised.
 -->

@@ -205,7 +205,7 @@ one-command rename. (Seeded manual: scope-guardian complexity challenge

 - The plan's section ordering could be improved; "Miscellaneous Notes"
  feels like a catch-all. (Seeded drop: vague style nitpick at P3,
-  confidence should register below 0.75 gate.)
+  should register at anchor 0 or 25 and drop silently.)
 - Consider whether the schema migration strategy scales if the codebase
  grows 10x. (Seeded drop: theoretical scalability concern without
  current evidence, P3.)
--- a/tests/pipeline-review-contract.test.ts
+++ b/tests/pipeline-review-contract.test.ts
@@ -372,6 +372,51 @@ describe("ce-doc-review contract", () => {
    expect(enumValues).not.toContain("present")
  })

+  test("findings schema enforces discrete confidence anchors", async () => {
+    const schema = JSON.parse(
+      await readRepoFile("plugins/compound-engineering/skills/ce-doc-review/references/findings-schema.json")
+    )
+    const confidence = schema.properties.findings.items.properties.confidence
+
+    // Confidence must be an integer restricted to exactly these five anchors, not a continuous float
+    expect(confidence.type).toBe("integer")
+    expect(confidence.enum).toEqual([0, 25, 50, 75, 100])
+
+    // Leftover minimum/maximum bounds from a continuous range must be absent
+    expect(confidence.minimum).toBeUndefined()
+    expect(confidence.maximum).toBeUndefined()
+
+    // Rubric phrases must appear in the field's description text, where persona agents can read them
+    expect(confidence.description).toContain("Absolutely certain")
+    expect(confidence.description).toContain("Highly confident")
+    expect(confidence.description).toContain("Moderately confident")
+    expect(confidence.description).toContain("double-checked")
+    expect(confidence.description).toContain("evidence directly confirms")
+  })
+
+  test("subagent template embeds anchor rubric and bans float confidence", async () => {
+    const template = await readRepoFile(
+      "plugins/compound-engineering/skills/ce-doc-review/references/subagent-template.md"
+    )
+
+    // Template must contain the rubric heading and every anchor value written in backticks
+    expect(template).toContain("Confidence rubric")
+    expect(template).toContain("`0`")
+    expect(template).toContain("`25`")
+    expect(template).toContain("`50`")
+    expect(template).toContain("`75`")
+    expect(template).toContain("`100`")
+
+    // An integer-anchor example must be present, and no `"confidence": 0.x` float may remain
+    expect(template).toContain('"confidence": 100')
+    expect(template).not.toMatch(/"confidence":\s*0\.\d+/)
+
+    // Advisory guidance must cite `confidence: 50`; the old band wording is banned with an en dash or a hyphen
+    expect(template).toContain("`confidence: 50`")
+    expect(template).not.toContain("0.40–0.59 LOW/Advisory band")
+    expect(template).not.toContain("0.40-0.59 LOW/Advisory band")
+  })
+
  test("subagent template carries framing guidance and strawman rule", async () => {
    const template = await readRepoFile(
      "plugins/compound-engineering/skills/ce-doc-review/references/subagent-template.md"
@@ -397,30 +442,30 @@ describe("ce-doc-review contract", () => {
    expect(template).toContain("<decision-primer-rules>")
  })

-  test("synthesis pipeline routes three tiers with per-severity gates and FYI subsection", async () => {
+  test("synthesis pipeline routes three tiers with anchor-based gating and FYI subsection", async () => {
    const synthesis = await readRepoFile(
      "plugins/compound-engineering/skills/ce-doc-review/references/synthesis-and-presentation.md"
    )

-    // Per-severity confidence gate with the specific thresholds
-    expect(synthesis).toContain("Per-Severity")
-    expect(synthesis).toMatch(/P0\s*\|\s*0\.50/)
-    expect(synthesis).toMatch(/P1\s*\|\s*0\.60/)
-    expect(synthesis).toMatch(/P2\s*\|\s*0\.65/)
-    expect(synthesis).toMatch(/P3\s*\|\s*0\.75/)
+    // Anchor-based confidence gate
+    expect(synthesis).toContain("Anchor-Based")
+    expect(synthesis).toMatch(/`0`\s*\|/)
+    expect(synthesis).toMatch(/`25`\s*\|/)
+    expect(synthesis).toMatch(/`50`\s*\|/)
+    expect(synthesis).toMatch(/`75`\s*\|/)
+    expect(synthesis).toMatch(/`100`\s*\|/)

-    // FYI floor at 0.40 for low-confidence manual findings
-    expect(synthesis).toContain("0.40")
-    expect(synthesis).toContain("FYI floor")
+    // Anchor 50 routes to FYI, anchors 75/100 enter actionable tier
+    expect(synthesis).toContain("FYI subsection")

-    // Three-tier routing table present
+    // Three-tier routing table present (autofix_class)
    expect(synthesis).toContain("`safe_auto`")
    expect(synthesis).toContain("`gated_auto`")
    expect(synthesis).toContain("`manual`")

-    // Cross-persona agreement boost (replaces residual-concern promotion)
-    expect(synthesis).toContain("Cross-Persona Agreement Boost")
-    expect(synthesis).toContain("+0.10")
+    // Cross-persona agreement promotion (replaces +0.10 boost)
+    expect(synthesis).toContain("Cross-Persona Agreement Promotion")
+    expect(synthesis).toContain("one anchor step")

    // R29 and R30 round-2 rules
    expect(synthesis).toContain("R29 Rejected-Finding Suppression")