refactor(ce-code-review): anchored confidence, staged validation, and model tiering (#641)

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-21 21:04:29 -07:00
parent b104ce46be
commit 5a26a8fbd3
28 changed files with 1201 additions and 119 deletions
--- a/plugins/compound-engineering/skills/ce-code-review/references/findings-schema.json
+++ b/plugins/compound-engineering/skills/ce-code-review/references/findings-schema.json
@@ -70,10 +70,9 @@
            "description": "Concrete minimal fix. Omit or null if no good fix is obvious -- a bad suggestion is worse than none."
          },
          "confidence": {
-            "type": "number",
-            "description": "Reviewer confidence in this finding, calibrated per persona",
-            "minimum": 0.0,
-            "maximum": 1.0
+            "type": "integer",
+            "enum": [0, 25, 50, 75, 100],
+            "description": "Anchored confidence score. Use exactly one of 0, 25, 50, 75, 100. Each anchor has a behavioral criterion the reviewer must honestly self-apply. 0: Not confident. This is a false positive that does not stand up to light scrutiny, or a pre-existing issue this PR did not introduce. 25: Somewhat confident. Might be a real issue but could also be a false positive; the reviewer could not verify from the diff and surrounding code alone. 50: Moderately confident. The reviewer verified this is a real issue, but it may be a nitpick, a narrow edge case, or of minimal practical impact. Relative to the diff's other concerns, it is not very important. Style preferences and subjective improvements land here. 75: Highly confident. The reviewer double-checked the diff and confirmed the issue will affect users, downstream callers, or runtime behavior in normal usage. The bug, vulnerability, or contract violation is clearly present and actionable. 100: Absolutely certain. The issue is verifiable from the code itself -- compile error, type mismatch, definitive logic bug, or an explicit project-standards violation with a quotable rule. No interpretation required."
          },
          "evidence": {
            "type": "array",
@@ -98,14 +97,20 @@
      "description": "Missing test coverage the reviewer identified",
      "items": { "type": "string" }
    }
-    },
+  },

  "_meta": {
+    "confidence_anchors": {
+      "description": "Confidence is one of 5 discrete anchors (0, 25, 50, 75, 100), each tied to a behavioral criterion the reviewer can honestly self-apply. Float values (e.g., 0.73) are not valid -- the model cannot meaningfully calibrate at finer granularity, and discrete anchors prevent false-precision gaming.",
+      "0": "False positive or pre-existing -- do not report",
+      "25": "Speculative; could not verify -- do not report",
+      "50": "Verified real but minor or stylistic -- report only when P0 or when synthesis routes to advisory/soft buckets",
+      "75": "Highly confident, will affect users or runtime in normal usage -- report",
+      "100": "Verifiable from code alone (compile error, type mismatch, definitive logic bug, quoted standards violation) -- report"
+    },
    "confidence_thresholds": {
-      "suppress": "Below 0.60 -- do not report. Finding is speculative noise. Exception: P0 findings at 0.50+ may be reported.",
-      "flag": "0.60-0.69 -- include only when the issue is clearly actionable with concrete evidence.",
-      "confident": "0.70-0.84 -- real and important. Report with full evidence.",
-      "certain": "0.85-1.00 -- verifiable from the code alone. Report."
+      "suppress": "Below anchor 75 -- do not report. Exception: P0 findings at anchor 50 may be reported (critical-but-uncertain issues must not be silently dropped).",
+      "report": "Anchor 75 or 100 -- include with full evidence."
    },
    "severity_definitions": {
      "P0": "Critical breakage, exploitable vulnerability, data loss/corruption. Must fix before merge.",