feat: promote ce:review-beta to stable ce:review (#371)

2026-03-24 21:00:38 -07:00
parent 4e3af07962
commit 7c5ff445e3
25 changed files with 556 additions and 1137 deletions
--- a/plugins/compound-engineering/skills/ce-review/SKILL.md
+++ b/plugins/compound-engineering/skills/ce-review/SKILL.md
--- a/plugins/compound-engineering/skills/ce-review/references/diff-scope.md
+++ b/plugins/compound-engineering/skills/ce-review/references/diff-scope.md
@@ -0,0 +1,31 @@
+# Diff Scope Rules
+
+These rules apply to every reviewer. They define what is "your code to review" versus pre-existing context.
+
+## Scope Discovery
+
+Determine the diff to review using this priority order:
+
+1. **User-specified scope.** If the caller passed `BASE:`, `FILES:`, or `DIFF:` markers, use that scope exactly.
+2. **Working copy changes.** If there are unstaged or staged changes (`git diff HEAD` is non-empty), review those.
+3. **Unpushed commits vs base branch.** If the working copy is clean, review `git diff $(git merge-base HEAD <base>)..HEAD` where `<base>` is the default branch (main or master).
+
+The scope step in the SKILL.md handles discovery and passes you the resolved diff. You do not need to run git commands yourself.
+
+## Finding Classification Tiers
+
+Every finding you report falls into one of three tiers based on its relationship to the diff:
+
+### Primary (directly changed code)
+
+Lines added or modified in the diff. This is your main focus. Report findings against these lines at full confidence.
+
+### Secondary (immediately surrounding code)
+
+Unchanged code within the same function, method, or block as a changed line. If a change introduces a bug that's only visible by reading the surrounding context, report it -- but note that the issue exists in the interaction between new and existing code.
+
+### Pre-existing (unrelated to this diff)
+
+Issues in unchanged code that the diff didn't touch and doesn't interact with. Mark these as `"pre_existing": true` in your output. They're reported separately and don't count toward the review verdict.
+
+**The rule:** If you'd flag the same issue on an identical diff that didn't include the surrounding file, it's pre-existing. If the diff makes the issue *newly relevant* (e.g., a new caller hits an existing buggy function), it's secondary.
--- a/plugins/compound-engineering/skills/ce-review/references/findings-schema.json
+++ b/plugins/compound-engineering/skills/ce-review/references/findings-schema.json
@@ -0,0 +1,128 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "Code Review Findings",
+  "description": "Structured output schema for code review sub-agents",
+  "type": "object",
+  "required": ["reviewer", "findings", "residual_risks", "testing_gaps"],
+  "properties": {
+    "reviewer": {
+      "type": "string",
+      "description": "Persona name that produced this output (e.g., 'correctness', 'security')"
+    },
+    "findings": {
+      "type": "array",
+      "description": "List of code review findings. Empty array if no issues found.",
+      "items": {
+        "type": "object",
+        "required": [
+          "title",
+          "severity",
+          "file",
+          "line",
+          "why_it_matters",
+          "autofix_class",
+          "owner",
+          "requires_verification",
+          "confidence",
+          "evidence",
+          "pre_existing"
+        ],
+        "properties": {
+          "title": {
+            "type": "string",
+            "description": "Short, specific issue title. 10 words or fewer.",
+            "maxLength": 100
+          },
+          "severity": {
+            "type": "string",
+            "enum": ["P0", "P1", "P2", "P3"],
+            "description": "Issue severity level"
+          },
+          "file": {
+            "type": "string",
+            "description": "Relative file path from repository root"
+          },
+          "line": {
+            "type": "integer",
+            "description": "Primary line number of the issue",
+            "minimum": 1
+          },
+          "why_it_matters": {
+            "type": "string",
+            "description": "Impact and failure mode -- not 'what is wrong' but 'what breaks'"
+          },
+          "autofix_class": {
+            "type": "string",
+            "enum": ["safe_auto", "gated_auto", "manual", "advisory"],
+            "description": "Reviewer's conservative recommendation for how this issue should be handled after synthesis"
+          },
+          "owner": {
+            "type": "string",
+            "enum": ["review-fixer", "downstream-resolver", "human", "release"],
+            "description": "Who should own the next action for this finding after synthesis"
+          },
+          "requires_verification": {
+            "type": "boolean",
+            "description": "Whether any fix for this finding must be re-verified with targeted tests or a follow-up review pass"
+          },
+          "suggested_fix": {
+            "type": ["string", "null"],
+            "description": "Concrete minimal fix. Omit or null if no good fix is obvious -- a bad suggestion is worse than none."
+          },
+          "confidence": {
+            "type": "number",
+            "description": "Reviewer confidence in this finding, calibrated per persona",
+            "minimum": 0.0,
+            "maximum": 1.0
+          },
+          "evidence": {
+            "type": "array",
+            "description": "Code-grounded evidence: snippets, line references, or pattern descriptions. At least 1 item.",
+            "items": { "type": "string" },
+            "minItems": 1
+          },
+          "pre_existing": {
+            "type": "boolean",
+            "description": "True if this issue exists in unchanged code unrelated to the current diff"
+          }
+        }
+      }
+    },
+    "residual_risks": {
+      "type": "array",
+      "description": "Risks the reviewer noticed but could not confirm as findings",
+      "items": { "type": "string" }
+    },
+    "testing_gaps": {
+      "type": "array",
+      "description": "Missing test coverage the reviewer identified",
+      "items": { "type": "string" }
+    }
+    },
+
+  "_meta": {
+    "confidence_thresholds": {
+      "suppress": "Below 0.60 -- do not report. Finding is speculative noise.",
+      "flag": "0.60-0.69 -- include only when the persona's calibration says the issue is actionable at that confidence.",
+      "report": "0.70+ -- report with full confidence."
+    },
+    "severity_definitions": {
+      "P0": "Critical breakage, exploitable vulnerability, data loss/corruption. Must fix before merge.",
+      "P1": "High-impact defect likely hit in normal usage, breaking contract. Should fix.",
+      "P2": "Moderate issue with meaningful downside (edge case, perf regression, maintainability trap). Fix if straightforward.",
+      "P3": "Low-impact, narrow scope, minor improvement. User's discretion."
+    },
+    "autofix_classes": {
+      "safe_auto": "Local, deterministic code or test fix suitable for the in-skill fixer in autonomous mode.",
+      "gated_auto": "Concrete fix exists, but it changes behavior, permissions, contracts, or other sensitive areas that deserve explicit approval.",
+      "manual": "Actionable issue that should become residual work rather than an in-skill autofix.",
+      "advisory": "Informational or operational item that should be surfaced in the report only."
+    },
+    "owners": {
+      "review-fixer": "The in-skill fixer can own this when policy allows.",
+      "downstream-resolver": "Turn this into residual work for later resolution.",
+      "human": "A person must make a judgment call before code changes should continue.",
+      "release": "Operational or rollout follow-up; do not convert into code-fix work automatically."
+    }
+  }
+}
--- a/plugins/compound-engineering/skills/ce-review/references/persona-catalog.md
+++ b/plugins/compound-engineering/skills/ce-review/references/persona-catalog.md
@@ -0,0 +1,50 @@
+# Persona Catalog
+
+8 reviewer personas organized in two tiers, plus CE-specific agents. The orchestrator uses this catalog to select which reviewers to spawn for each review.
+
+## Always-on (3 personas + 2 CE agents)
+
+Spawned on every review regardless of diff content.
+
+**Persona agents (structured JSON output):**
+
+| Persona | Agent | Focus |
+|---------|-------|-------|
+| `correctness` | `compound-engineering:review:correctness-reviewer` | Logic errors, edge cases, state bugs, error propagation, intent compliance |
+| `testing` | `compound-engineering:review:testing-reviewer` | Coverage gaps, weak assertions, brittle tests, missing edge case tests |
+| `maintainability` | `compound-engineering:review:maintainability-reviewer` | Coupling, complexity, naming, dead code, premature abstraction |
+
+**CE agents (unstructured output, synthesized separately):**
+
+| Agent | Focus |
+|-------|-------|
+| `compound-engineering:review:agent-native-reviewer` | Verify new features are agent-accessible |
+| `compound-engineering:research:learnings-researcher` | Search docs/solutions/ for past issues related to this PR's modules and patterns |
+
+## Conditional (5 personas)
+
+Spawned when the orchestrator identifies relevant patterns in the diff. The orchestrator reads the full diff and reasons about selection -- this is agent judgment, not keyword matching.
+
+| Persona | Agent | Select when diff touches... |
+|---------|-------|---------------------------|
+| `security` | `compound-engineering:review:security-reviewer` | Auth middleware, public endpoints, user input handling, permission checks, secrets management |
+| `performance` | `compound-engineering:review:performance-reviewer` | Database queries, ORM calls, loop-heavy data transforms, caching layers, async/concurrent code |
+| `api-contract` | `compound-engineering:review:api-contract-reviewer` | Route definitions, serializer/interface changes, event schemas, exported type signatures, API versioning |
+| `data-migrations` | `compound-engineering:review:data-migrations-reviewer` | Migration files, schema changes, backfill scripts, data transformations |
+| `reliability` | `compound-engineering:review:reliability-reviewer` | Error handling, retry logic, circuit breakers, timeouts, background jobs, async handlers, health checks |
+
+## CE Conditional Agents (migration-specific)
+
+These CE-native agents provide specialized analysis beyond what the persona agents cover. Spawn them when the diff includes database migrations, schema.rb, or data backfills.
+
+| Agent | Focus |
+|-------|-------|
+| `compound-engineering:review:schema-drift-detector` | Cross-references schema.rb changes against included migrations to catch unrelated drift |
+| `compound-engineering:review:deployment-verification-agent` | Produces Go/No-Go deployment checklist with SQL verification queries and rollback procedures |
+
+## Selection rules
+
+1. **Always spawn all 3 always-on personas** plus the 2 CE always-on agents.
+2. **For each conditional persona**, the orchestrator reads the diff and decides whether the persona's domain is relevant. This is a judgment call, not a keyword match.
+3. **For CE conditional agents**, spawn when the diff includes migration files (`db/migrate/*.rb`, `db/schema.rb`) or data backfill scripts.
+4. **Announce the team** before spawning with a one-line justification per conditional reviewer selected.
--- a/plugins/compound-engineering/skills/ce-review/references/review-output-template.md
+++ b/plugins/compound-engineering/skills/ce-review/references/review-output-template.md
@@ -0,0 +1,115 @@
+# Code Review Output Template
+
+Use this **exact format** when presenting synthesized review findings. Findings are grouped by severity, not by reviewer.
+
+**IMPORTANT:** Use pipe-delimited markdown tables (`| col | col |`). Do NOT use ASCII box-drawing characters.
+
+## Example
+
+```markdown
+## Code Review Results
+
+**Scope:** merge-base with the review base branch -> working tree (14 files, 342 lines)
+**Intent:** Add order export endpoint with CSV and JSON format support
+**Mode:** autofix
+
+**Reviewers:** correctness, testing, maintainability, security, api-contract
+- security -- new public endpoint accepts user-provided format parameter
+- api-contract -- new /api/orders/export route with response schema
+
+### P0 -- Critical
+
+| # | File | Issue | Reviewer | Confidence | Route |
+|---|------|-------|----------|------------|-------|
+| 1 | `orders_controller.rb:42` | User-supplied ID in account lookup without ownership check | security | 0.92 | `gated_auto -> downstream-resolver` |
+
+### P1 -- High
+
+| # | File | Issue | Reviewer | Confidence | Route |
+|---|------|-------|----------|------------|-------|
+| 2 | `export_service.rb:87` | Loads all orders into memory -- unbounded for large accounts | performance | 0.85 | `safe_auto -> review-fixer` |
+| 3 | `export_service.rb:91` | No pagination -- response size grows linearly with order count | api-contract, performance | 0.80 | `manual -> downstream-resolver` |
+
+### P2 -- Moderate
+
+| # | File | Issue | Reviewer | Confidence | Route |
+|---|------|-------|----------|------------|-------|
+| 4 | `export_service.rb:45` | Missing error handling for CSV serialization failure | correctness | 0.75 | `safe_auto -> review-fixer` |
+
+### P3 -- Low
+
+| # | File | Issue | Reviewer | Confidence | Route |
+|---|------|-------|----------|------------|-------|
+| 5 | `export_helper.rb:12` | Format detection could use early return instead of nested conditional | maintainability | 0.70 | `advisory -> human` |
+
+### Applied Fixes
+
+- `safe_auto`: Added bounded export pagination guard and CSV serialization failure test coverage in this run
+
+### Residual Actionable Work
+
+| # | File | Issue | Route | Next Step |
+|---|------|-------|-------|-----------|
+| 1 | `orders_controller.rb:42` | Ownership check missing on export lookup | `gated_auto -> downstream-resolver` | Create residual todo and require explicit approval before behavior change |
+| 2 | `export_service.rb:91` | Pagination contract needs a broader API decision | `manual -> downstream-resolver` | Create residual todo with contract and client impact details |
+
+### Pre-existing Issues
+
+| # | File | Issue | Reviewer |
+|---|------|-------|----------|
+| 1 | `orders_controller.rb:12` | Broad rescue masking failed permission check | correctness |
+
+### Learnings & Past Solutions
+
+- [Known Pattern] `docs/solutions/export-pagination.md` -- previous export pagination fix applies to this endpoint
+
+### Agent-Native Gaps
+
+- New export endpoint has no CLI/agent equivalent -- agent users cannot trigger exports
+
+### Schema Drift Check
+
+- Clean: schema.rb changes match the migrations in scope
+
+### Deployment Notes
+
+- Pre-deploy: capture baseline row counts before enabling the export backfill
+- Verify: `SELECT COUNT(*) FROM exports WHERE status IS NULL;` should stay at `0`
+- Rollback: keep the old export path available until the backfill has been validated
+
+### Coverage
+
+- Suppressed: 2 findings below 0.60 confidence
+- Residual risks: No rate limiting on export endpoint
+- Testing gaps: No test for concurrent export requests
+
+---
+
+> **Verdict:** Ready with fixes
+>
+> **Reasoning:** 1 critical auth bypass must be fixed. The memory/pagination issues (P1) should be addressed for production safety.
+>
+> **Fix order:** P0 auth bypass -> P1 memory/pagination -> P2 error handling if straightforward
+```
+
+## Formatting Rules
+
+- **Pipe-delimited markdown tables** -- never ASCII box-drawing characters
+- **Severity-grouped sections** -- `### P0 -- Critical`, `### P1 -- High`, `### P2 -- Moderate`, `### P3 -- Low`. Omit empty severity levels.
+- **Always include file:line location** for code review issues
+- **Reviewer column** shows which persona(s) flagged the issue. Multiple reviewers = cross-reviewer agreement.
+- **Confidence column** shows the finding's confidence score
+- **Route column** shows the synthesized handling decision as ``<autofix_class> -> <owner>``.
+- **Header includes** scope, intent, and reviewer team with per-conditional justifications
+- **Mode line** -- include `interactive`, `autofix`, or `report-only`
+- **Applied Fixes section** -- include only when a fix phase ran in this review invocation
+- **Residual Actionable Work section** -- include only when unresolved actionable findings were handed off for later work
+- **Pre-existing section** -- separate table, no confidence column (these are informational)
+- **Learnings & Past Solutions section** -- results from learnings-researcher, with links to docs/solutions/ files
+- **Agent-Native Gaps section** -- results from agent-native-reviewer. Omit if no gaps found.
+- **Schema Drift Check section** -- results from schema-drift-detector. Omit if the agent did not run.
+- **Deployment Notes section** -- key checklist items from deployment-verification-agent. Omit if the agent did not run.
+- **Coverage section** -- suppressed count, residual risks, testing gaps, failed reviewers
+- **Summary uses blockquotes** for verdict, reasoning, and fix order
+- **Horizontal rule** (`---`) separates findings from verdict
+- **`###` headers** for each section -- never plain text headers
--- a/plugins/compound-engineering/skills/ce-review/references/subagent-template.md
+++ b/plugins/compound-engineering/skills/ce-review/references/subagent-template.md
@@ -0,0 +1,56 @@
+# Sub-agent Prompt Template
+
+This template is used by the orchestrator to spawn each reviewer sub-agent. Variable substitution slots are filled at spawn time.
+
+---
+
+## Template
+
+```
+You are a specialist code reviewer.
+
+<persona>
+{persona_file}
+</persona>
+
+<scope-rules>
+{diff_scope_rules}
+</scope-rules>
+
+<output-contract>
+Return ONLY valid JSON matching the findings schema below. No prose, no markdown, no explanation outside the JSON object.
+
+{schema}
+
+Rules:
+- Suppress any finding below your stated confidence floor (see your Confidence calibration section).
+- Every finding MUST include at least one evidence item grounded in the actual code.
+- Set pre_existing to true ONLY for issues in unchanged code that are unrelated to this diff. If the diff makes the issue newly relevant, it is NOT pre-existing.
+- You are operationally read-only. You may use non-mutating inspection commands, including read-oriented `git` / `gh` commands, to gather evidence. Do not edit files, change branches, commit, push, create PRs, or otherwise mutate the checkout or repository state.
+- Set `autofix_class` conservatively. Use `safe_auto` only when the fix is local, deterministic, and low-risk. Use `gated_auto` when a concrete fix exists but changes behavior/contracts/permissions. Use `manual` for actionable residual work. Use `advisory` for report-only items that should not become code-fix work.
+- Set `owner` to the default next actor for this finding: `review-fixer`, `downstream-resolver`, `human`, or `release`.
+- Set `requires_verification` to true whenever the likely fix needs targeted tests, a focused re-review, or operational validation before it should be trusted.
+- suggested_fix is optional. Only include it when the fix is obvious and correct. A bad suggestion is worse than none.
+- If you find no issues, return an empty findings array. Still populate residual_risks and testing_gaps if applicable.
+</output-contract>
+
+<review-context>
+Intent: {intent_summary}
+
+Changed files: {file_list}
+
+Diff:
+{diff}
+</review-context>
+```
+
+## Variable Reference
+
+| Variable | Source | Description |
+|----------|--------|-------------|
+| `{persona_file}` | Agent markdown file content | The full persona definition (identity, failure modes, calibration, suppress conditions) |
+| `{diff_scope_rules}` | `references/diff-scope.md` content | Primary/secondary/pre-existing tier rules |
+| `{schema}` | `references/findings-schema.json` content | The JSON schema reviewers must conform to |
+| `{intent_summary}` | Stage 2 output | 2-3 line description of what the change is trying to accomplish |
+| `{file_list}` | Stage 1 output | List of changed files from the scope step |
+| `{diff}` | Stage 1 output | The actual diff content to review |