refactor(session-history): move extraction scripts behind skills (#619)

2026-04-21 00:12:11 -07:00
parent e0f2a4f9d7
commit cd4af86e5e
9 changed files with 153 additions and 32 deletions
--- a/plugins/compound-engineering/skills/ce-session-inventory/SKILL.md
+++ b/plugins/compound-engineering/skills/ce-session-inventory/SKILL.md
@@ -0,0 +1,58 @@
+---
+name: ce-session-inventory
+description: "Discover session files for a repo across Claude Code, Codex, and Cursor, and extract session metadata (timestamps, branch, cwd, size, platform). Invoked by session-research agents — not intended for direct user queries."
+user-invocable: false
+context: fork
+---
+
+# Session inventory
+
+Agent-facing primitive. Discover session files and emit session metadata as JSONL across Claude Code, Codex, and Cursor.
+
+This skill exists so that agents researching session history do not need to know the layout of session stores on disk or the JSONL shapes of each platform. The scripts under `scripts/` own that knowledge.
+
+## Arguments
+
+Space-separated positional args:
+
+1. `<repo>` — repo folder name (e.g., `my-project`). Used for directory matching in Claude Code and Cursor, and as the CWD filter for Codex sessions.
+2. `<days>` — scan window in days (e.g., `7`). Session files older than this are skipped.
+3. `<platform>` *(optional)* — one of `claude`, `codex`, `cursor`. Omit to search all three. When present, pass it to the discovery script as `--platform <platform>`.
+
+## Execution
+
+Run the discovery-plus-metadata pipeline from the skill's own `scripts/` directory:
+
+```bash
+bash scripts/discover-sessions.sh <repo> <days> [--platform <platform>] \
+  | tr '\n' '\0' \
+  | xargs -0 python3 scripts/extract-metadata.py --cwd-filter <repo>
+```
+
+Return the raw stdout verbatim — one JSON object per session, then a final `_meta` line. Callers parse the JSONL directly, so do not paraphrase, reformat, or summarize.
+
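+If discovery finds no files, the pipeline still emits a clean `_meta` line (`files_processed: 0`), provided `xargs` runs the command on empty input (GNU `xargs` does; BSD/macOS `xargs` does not, in which case the output is empty). Return whatever is emitted as-is.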
+
+## Output format
+
+Each session line is a JSON object. Common fields across platforms:
+
+- `platform` — `claude`, `codex`, or `cursor`
+- `file` — absolute path to the session JSONL
+- `size` — file size in bytes
+- `ts` — session start timestamp (ISO 8601)
+- `session` — session identifier
+
+Platform-specific fields:
+
+- Claude Code adds `branch` (git branch) and `last_ts` (last message timestamp).
+- Codex adds `cwd` (working directory), `source`, `cli_version`, `model`, `last_ts`.
+- Cursor has no in-file timestamps or metadata — `ts` is derived from file mtime and `session` from the containing directory name.
+
+The final `_meta` line has `files_processed`, `parse_errors`, and optionally `filtered_by_cwd` (count of Codex sessions dropped by the CWD filter).
+
+## Error handling
+
+If the discovery script errors (e.g., unreadable home directory, permission failure), let the error surface to the caller. Do not substitute git log, file listings, or other sources — this skill's contract is session metadata, nothing else.
+
+If `_meta` reports `parse_errors > 0`, return the JSONL as-is. The caller decides how to handle partial data.
--- a/plugins/compound-engineering/skills/ce-session-inventory/scripts/discover-sessions.sh
+++ b/plugins/compound-engineering/skills/ce-session-inventory/scripts/discover-sessions.sh
@@ -0,0 +1,81 @@
+#!/usr/bin/env bash
+# Discover session files across Claude Code, Codex, and Cursor.
+#
+# Usage: discover-sessions.sh <repo-name> <days> [--platform claude|codex|cursor]
+#
+# Outputs one file path per line. Safe in both bash and zsh (all globs guarded).
+# Pass output to extract-metadata.py (NUL-delimited so paths containing spaces survive):
+#   bash discover-sessions.sh <repo-name> 7 | tr '\n' '\0' | xargs -0 python3 extract-metadata.py --cwd-filter <repo-name>
+#
+# Arguments:
+#   repo-name  Folder name of the repo (e.g., "my-repo"). Used for directory matching.
+#   days       Scan window in days (e.g., 7). Files older than this are skipped.
+#   --platform Restrict to a single platform. Omit to search all.
+
+set -euo pipefail
+
+REPO_NAME="${1:?Usage: discover-sessions.sh <repo-name> <days> [--platform claude|codex|cursor]}"
+DAYS="${2:?Usage: discover-sessions.sh <repo-name> <days> [--platform claude|codex|cursor]}"
+# Default to every platform; only an explicit --platform flag narrows it.
+# (Reading the 4th positional here would treat any stray argument as a platform.)
+PLATFORM="all"
+
+# Parse optional --platform flag; any other trailing argument is ignored.
+shift 2
+while [ $# -gt 0 ]; do
+    case "$1" in
+        --platform)
+            # Under `set -u` a bare "$2" would abort with an opaque
+            # "unbound variable" error, so check for the value explicitly.
+            if [ $# -lt 2 ]; then
+                echo "Usage: discover-sessions.sh <repo-name> <days> [--platform claude|codex|cursor]" >&2
+                exit 1
+            fi
+            PLATFORM="$2"
+            shift 2
+            ;;
+        *) shift ;;
+    esac
+done
+
+# --- Claude Code ---
+discover_claude() {
+    # Print Claude Code session files (*.jsonl) modified within the last DAYS
+    # days, from every project directory whose name contains REPO_NAME.
+    # Globals:  HOME, REPO_NAME, DAYS (read)
+    # Outputs:  one file path per line on stdout
+    # Returns:  0 when the projects directory does not exist
+    local base="$HOME/.claude/projects"
+    local dir
+    [ -d "$base" ] || return 0
+
+    # An unmatched glob stays literal; the -d test below discards it.
+    for dir in "$base"/*"$REPO_NAME"*/; do
+        [ -d "$dir" ] || continue
+        # Strip the glob's trailing slash so emitted paths never contain "//".
+        find "${dir%/}" -maxdepth 1 -name "*.jsonl" -mtime "-${DAYS}" 2>/dev/null
+    done
+}
+
+# --- Codex ---
+discover_codex() {
+    # Print Codex session files (*.jsonl) modified within the last DAYS days
+    # from each known session root. No repo-name matching happens here.
+    # Globals:  HOME, DAYS (read)
+    # Outputs:  one file path per line on stdout
+    local base
+    for base in "$HOME/.codex/sessions" "$HOME/.agents/sessions"; do
+        [ -d "$base" ] || continue
+
+        # Use mtime-based discovery (consistent with Claude/Cursor) so that
+        # sessions started before the scan window but still active within it
+        # are not missed.
+        find "$base" -name "*.jsonl" -mtime "-${DAYS}" 2>/dev/null
+    done
+}
+
+# --- Cursor ---
+discover_cursor() {
+    # Print Cursor agent transcripts (*.jsonl) modified within the last DAYS
+    # days, from the agent-transcripts/ folder of every project directory
+    # whose name contains REPO_NAME.
+    # Globals:  HOME, REPO_NAME, DAYS (read)
+    # Outputs:  one file path per line on stdout
+    # Returns:  0 when the projects directory does not exist
+    local base="$HOME/.cursor/projects"
+    local dir transcripts
+    [ -d "$base" ] || return 0
+
+    # An unmatched glob stays literal; the -d test below discards it.
+    for dir in "$base"/*"$REPO_NAME"*/; do
+        [ -d "$dir" ] || continue
+        # The glob leaves a trailing slash on $dir; drop it to avoid "//".
+        transcripts="${dir%/}/agent-transcripts"
+        [ -d "$transcripts" ] || continue
+        find "$transcripts" -name "*.jsonl" -mtime "-${DAYS}" 2>/dev/null
+    done
+}
+
+# --- Dispatch ---
+# Run the discovery function(s) selected by PLATFORM; reject anything else.
+case "$PLATFORM" in
+    claude | codex | cursor)
+        "discover_${PLATFORM}"
+        ;;
+    all)
+        for platform_name in claude codex cursor; do
+            "discover_${platform_name}"
+        done
+        ;;
+    *)
+        echo "Unknown platform: $PLATFORM" >&2
+        exit 1
+        ;;
+esac
--- a/plugins/compound-engineering/skills/ce-session-inventory/scripts/extract-metadata.py
+++ b/plugins/compound-engineering/skills/ce-session-inventory/scripts/extract-metadata.py
@@ -0,0 +1,187 @@
+#!/usr/bin/env python3
+"""Extract session metadata from Claude Code, Codex, and Cursor JSONL files.
+
+Batch mode (preferred — one invocation for all files):
+  python3 extract-metadata.py /path/to/dir/*.jsonl
+  python3 extract-metadata.py file1.jsonl file2.jsonl file3.jsonl
+
+Single-file mode (stdin):
+  head -20 <session.jsonl> | python3 extract-metadata.py
+
+Auto-detects platform from the JSONL structure.
+Outputs one JSON object per file, one per line.
+Includes a final _meta line with processing stats.
+"""
+import sys
+import json
+import os
+
+# Upper bound on how many leading lines process_file() reads from each
+# session file before attempting platform detection.
+MAX_LINES = 25  # Only need first ~25 lines for metadata
+
+
+def try_claude(lines):
+    """Detect a Claude Code session and pull its metadata from header lines.
+
+    Args:
+        lines: Iterable of raw JSONL text lines.
+
+    Returns:
+        A dict with ``platform``, ``branch``, ``ts`` and ``session`` taken from
+        the first JSON object whose ``type`` is ``"user"`` and which carries a
+        ``gitBranch`` key, or ``None`` when no such line exists.
+    """
+    for line in lines:
+        try:
+            obj = json.loads(line.strip())
+        except json.JSONDecodeError:
+            continue
+        # Valid JSON that is not an object (list, string, number, null) has no
+        # .get(); skip it instead of letting AttributeError abort the batch.
+        if not isinstance(obj, dict):
+            continue
+        if obj.get("type") == "user" and "gitBranch" in obj:
+            return {
+                "platform": "claude",
+                "branch": obj["gitBranch"],
+                "ts": obj.get("timestamp", ""),
+                "session": obj.get("sessionId", ""),
+            }
+    return None
+
+
+def try_codex(lines):
+    """Detect a Codex session and collect its metadata from header lines.
+
+    ``session_meta`` records supply ``platform``, ``cwd``, ``session``, ``ts``,
+    ``source`` and ``cli_version``; ``turn_context`` records supply ``model``
+    and a fallback ``cwd`` when none has been set yet.
+
+    Args:
+        lines: Iterable of raw JSONL text lines.
+
+    Returns:
+        The collected metadata dict, or ``None`` when no ``session_meta``
+        record was seen. Without one the result would carry no ``platform``
+        key, so it is treated as "not a Codex session".
+    """
+    meta = {}
+    for line in lines:
+        try:
+            obj = json.loads(line.strip())
+        except json.JSONDecodeError:
+            continue
+        # Skip valid JSON that is not an object (list, string, number, null).
+        if not isinstance(obj, dict):
+            continue
+        payload = obj.get("payload")
+        if not isinstance(payload, dict):
+            # Missing or null payload: fall back to empty defaults below.
+            payload = {}
+        kind = obj.get("type")
+        if kind == "session_meta":
+            meta["platform"] = "codex"
+            meta["cwd"] = payload.get("cwd", "")
+            meta["session"] = payload.get("id", "")
+            meta["ts"] = payload.get("timestamp", obj.get("timestamp", ""))
+            meta["source"] = payload.get("source", "")
+            meta["cli_version"] = payload.get("cli_version", "")
+        elif kind == "turn_context":
+            meta["model"] = payload.get("model", "")
+            # Keep the cwd from session_meta if one was already recorded.
+            meta["cwd"] = meta.get("cwd") or payload.get("cwd", "")
+    return meta if "platform" in meta else None
+
+
+def try_cursor(lines):
+    """Detect a Cursor agent transcript.
+
+    Cursor agent transcripts: role-based entries, no timestamps or metadata fields.
+
+    Args:
+        lines: Iterable of raw JSONL text lines.
+
+    Returns:
+        ``{"platform": "cursor"}`` as soon as a JSON object with a top-level
+        ``role`` of ``"user"`` or ``"assistant"`` and no ``type`` key is found,
+        otherwise ``None``.
+    """
+    for line in lines:
+        try:
+            obj = json.loads(line.strip())
+        except json.JSONDecodeError:
+            continue
+        # Skip valid JSON that is not an object (list, string, number, null).
+        if not isinstance(obj, dict):
+            continue
+        # Cursor entries have 'role' at top level but no 'type'
+        if obj.get("role") in ("user", "assistant") and "type" not in obj:
+            return {"platform": "cursor"}
+    return None
+
+
+def extract_from_lines(lines):
+    """Run the platform detectors in order (Claude, Codex, Cursor).
+
+    Returns the first truthy detector result; when none matches, the result of
+    the last detector tried is returned.
+    """
+    found = None
+    for detector in (try_claude, try_codex, try_cursor):
+        found = detector(lines)
+        if found:
+            break
+    return found
+
+
+TAIL_BYTES = 16384  # Read last 16KB to find final timestamp past trailing metadata
+
+
+def get_last_timestamp(filepath, size):
+    """Return the ``timestamp`` of the last JSON object in the file's tail.
+
+    Only the final ``TAIL_BYTES`` bytes are read. The first line of that
+    window may be cut mid-record; lines that fail to parse are skipped.
+
+    Args:
+        filepath: Path of the session file.
+        size: Size of the file in bytes, used to compute the seek offset.
+
+    Returns:
+        The ``timestamp`` value of the last JSON object that has one, or
+        ``None`` if there is none or the file cannot be read.
+    """
+    try:
+        with open(filepath, "rb") as f:
+            f.seek(max(0, size - TAIL_BYTES))
+            tail = f.read().decode("utf-8", errors="ignore")
+    except OSError:
+        return None
+    for line in reversed(tail.strip().split("\n")):
+        try:
+            obj = json.loads(line.strip())
+        except json.JSONDecodeError:
+            continue
+        # `"timestamp" in obj` raises TypeError on numbers and does a substring
+        # match on strings, so only JSON objects are considered.
+        if isinstance(obj, dict) and "timestamp" in obj:
+            return obj["timestamp"]
+    return None
+
+
+def process_file(filepath):
+    """Extract metadata for one session file.
+
+    Reads at most ``MAX_LINES`` leading lines, detects the platform, then adds
+    ``file`` and ``size``. Cursor results get ``ts`` from the file mtime and
+    ``session`` from the parent directory name; other platforms get
+    ``last_ts`` when a trailing timestamp is found.
+
+    Args:
+        filepath: Path of the session JSONL file.
+
+    Returns:
+        ``(result, None)`` on success, or ``(None, filepath)`` when the file
+        cannot be read or no platform could be detected.
+    """
+    try:
+        size = os.path.getsize(filepath)
+        # Decode explicitly as UTF-8 and replace undecodable bytes: with the
+        # locale default, a stray byte raises UnicodeDecodeError, which the
+        # OSError handler below does not catch.
+        with open(filepath, "r", encoding="utf-8", errors="replace") as f:
+            lines = []
+            for i, line in enumerate(f):
+                if i >= MAX_LINES:
+                    break
+                lines.append(line)
+        result = extract_from_lines(lines)
+        # Metadata without a platform cannot be classified; report it as
+        # unparsed instead of raising KeyError below.
+        if not result or not result.get("platform"):
+            return None, filepath
+        result["file"] = filepath
+        result["size"] = size
+        if result["platform"] == "cursor":
+            # Cursor transcripts have no timestamps in JSONL.
+            # Use file modification time as the best available signal.
+            # Derive session ID from the parent directory name (UUID).
+            from datetime import datetime, timezone
+
+            mtime = os.path.getmtime(filepath)
+            result["ts"] = datetime.fromtimestamp(mtime, tz=timezone.utc).isoformat()
+            result["session"] = os.path.basename(os.path.dirname(filepath))
+        else:
+            last_ts = get_last_timestamp(filepath, size)
+            if last_ts:
+                result["last_ts"] = last_ts
+        return result, None
+    except OSError:
+        return None, filepath
+
+
+# Command-line handling: positional arguments are session files, and
+# `--cwd-filter <substring>` optionally restricts results by working directory.
+# Any other dash-prefixed token (including a trailing --cwd-filter with no
+# value) is ignored.
+argv = sys.argv[1:]
+files = []
+cwd_filter = None
+pos = 0
+while pos < len(argv):
+    token = argv[pos]
+    if token == "--cwd-filter" and pos + 1 < len(argv):
+        cwd_filter = argv[pos + 1]
+        pos += 2
+        continue
+    if not token.startswith("-"):
+        files.append(token)
+    pos += 1
+
+if files:
+    # Batch mode: one JSON line per session, then a closing _meta summary.
+    stats = {"_meta": True, "files_processed": 0, "parse_errors": 0}
+    filtered = 0
+    for path in files:
+        if not path.endswith(".jsonl"):
+            continue
+        result, error = process_file(path)
+        stats["files_processed"] += 1
+        if not result:
+            if error:
+                stats["parse_errors"] += 1
+            continue
+        # CWD filter: drop sessions whose recorded cwd does not contain the
+        # filter substring (only results that carry a non-empty cwd are tested).
+        if cwd_filter and result.get("cwd") and cwd_filter not in result["cwd"]:
+            filtered += 1
+            continue
+        print(json.dumps(result))
+
+    if filtered:
+        stats["filtered_by_cwd"] = filtered
+    print(json.dumps(stats))
+else:
+    # No file arguments: either single-file stdin mode or an invocation with
+    # nothing to process. A TTY or empty stdin yields a clean zero-file result
+    # instead of a false parse error.
+    lines = [] if sys.stdin.isatty() else list(sys.stdin)
+
+    if lines:
+        # Genuine single-file stdin mode (backward compatible)
+        result = extract_from_lines(lines)
+        if result:
+            print(json.dumps(result))
+        print(json.dumps({"_meta": True, "files_processed": 1, "parse_errors": 0 if result else 1}))
+    else:
+        # No input at all — zero-file result (clean exit for empty pipelines)
+        print(json.dumps({"_meta": True, "files_processed": 0, "parse_errors": 0}))