refactor(session-history): move extraction scripts behind skills (#619)

Trevin Chow authored on 2026-04-21 00:12:11 -07:00, committed by GitHub
parent e0f2a4f9d7, commit cd4af86e5e
9 changed files with 153 additions and 32 deletions


@@ -0,0 +1,81 @@
#!/usr/bin/env bash
# Discover session files across Claude Code, Codex, and Cursor.
#
# Usage: discover-sessions.sh <repo-name> <days> [--platform claude|codex|cursor]
#
# Outputs one file path per line. All globs are guarded with directory checks,
# so a run that matches nothing produces no output instead of erroring.
# Pass output to extract-metadata.py:
# python3 extract-metadata.py --cwd-filter <repo-name> $(bash discover-sessions.sh <repo-name> 7)
#
# Arguments:
# repo-name Folder name of the repo (e.g., "my-repo"). Used for directory matching.
# days Scan window in days (e.g., 7). Files older than this are skipped.
# --platform Restrict to a single platform. Omit to search all.
set -euo pipefail
REPO_NAME="${1:?Usage: discover-sessions.sh <repo-name> <days> [--platform claude|codex|cursor]}"
DAYS="${2:?Usage: discover-sessions.sh <repo-name> <days> [--platform claude|codex|cursor]}"
PLATFORM="all"
# Parse optional --platform flag
shift 2
while [ $# -gt 0 ]; do
case "$1" in
--platform) PLATFORM="$2"; shift 2 ;;
*) shift ;;
esac
done
# --- Claude Code ---
discover_claude() {
local base="$HOME/.claude/projects"
[ -d "$base" ] || return 0
# Find all project dirs matching repo name
for dir in "$base"/*"$REPO_NAME"*/; do
[ -d "$dir" ] || continue
find "$dir" -maxdepth 1 -name "*.jsonl" -mtime "-${DAYS}" 2>/dev/null
done
}
# --- Codex ---
discover_codex() {
for base in "$HOME/.codex/sessions" "$HOME/.agents/sessions"; do
[ -d "$base" ] || continue
# Use mtime-based discovery (consistent with Claude/Cursor) so that
# sessions started before the scan window but still active within it
# are not missed.
find "$base" -name "*.jsonl" -mtime "-${DAYS}" 2>/dev/null
done
}
# --- Cursor ---
discover_cursor() {
local base="$HOME/.cursor/projects"
[ -d "$base" ] || return 0
for dir in "$base"/*"$REPO_NAME"*/; do
[ -d "$dir" ] || continue
local transcripts="$dir/agent-transcripts"
[ -d "$transcripts" ] || continue
find "$transcripts" -name "*.jsonl" -mtime "-${DAYS}" 2>/dev/null
done
}
# --- Dispatch ---
case "$PLATFORM" in
claude) discover_claude ;;
codex) discover_codex ;;
cursor) discover_cursor ;;
all)
discover_claude
discover_codex
discover_cursor
;;
*)
echo "Unknown platform: $PLATFORM" >&2
exit 1
;;
esac
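
A quick usage sketch (the repo folder name "my-repo" and the day counts are hypothetical; the script names are as above):

# All platforms, sessions touched in the last 7 days:
bash discover-sessions.sh my-repo 7
# Codex only, last 3 days:
bash discover-sessions.sh my-repo 3 --platform codex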


@@ -0,0 +1,187 @@
#!/usr/bin/env python3
"""Extract session metadata from Claude Code, Codex, and Cursor JSONL files.
Batch mode (preferred — one invocation for all files):
python3 extract-metadata.py /path/to/dir/*.jsonl
python3 extract-metadata.py file1.jsonl file2.jsonl file3.jsonl
Single-file mode (stdin):
head -20 <session.jsonl> | python3 extract-metadata.py
Auto-detects platform from the JSONL structure.
Outputs one JSON object per file, one per line.
Includes a final _meta line with processing stats.
"""
import sys
import json
import os
from datetime import datetime, timezone
MAX_LINES = 25 # Only need first ~25 lines for metadata
def try_claude(lines):
for line in lines:
try:
obj = json.loads(line.strip())
if obj.get("type") == "user" and "gitBranch" in obj:
return {
"platform": "claude",
"branch": obj["gitBranch"],
"ts": obj.get("timestamp", ""),
"session": obj.get("sessionId", ""),
}
except (json.JSONDecodeError, KeyError):
pass
return None
def try_codex(lines):
meta = {}
for line in lines:
try:
obj = json.loads(line.strip())
if obj.get("type") == "session_meta":
p = obj.get("payload", {})
meta["platform"] = "codex"
meta["cwd"] = p.get("cwd", "")
meta["session"] = p.get("id", "")
meta["ts"] = p.get("timestamp", obj.get("timestamp", ""))
meta["source"] = p.get("source", "")
meta["cli_version"] = p.get("cli_version", "")
elif obj.get("type") == "turn_context":
p = obj.get("payload", {})
meta["model"] = p.get("model", "")
meta["cwd"] = meta.get("cwd") or p.get("cwd", "")
except (json.JSONDecodeError, KeyError):
pass
    return meta if meta.get("platform") else None
def try_cursor(lines):
"""Cursor agent transcripts: role-based entries, no timestamps or metadata fields."""
for line in lines:
try:
obj = json.loads(line.strip())
# Cursor entries have 'role' at top level but no 'type'
if obj.get("role") in ("user", "assistant") and "type" not in obj:
return {"platform": "cursor"}
except (json.JSONDecodeError, KeyError):
pass
return None
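# For orientation, the (assumed, simplified) first-line shapes each detector keys off.
# Field names match what the functions above read; values are placeholders:
#   Claude Code: {"type": "user", "gitBranch": "main", "timestamp": "...", "sessionId": "..."}
#   Codex:       {"type": "session_meta", "payload": {"id": "...", "cwd": "...", "timestamp": "..."}}
#   Cursor:      {"role": "user", ...}  (no top-level "type" and no timestamps)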
def extract_from_lines(lines):
return try_claude(lines) or try_codex(lines) or try_cursor(lines)
TAIL_BYTES = 16384 # Read last 16KB to find final timestamp past trailing metadata
def get_last_timestamp(filepath, size):
"""Read the tail of a file to find the last message with a timestamp."""
try:
with open(filepath, "rb") as f:
f.seek(max(0, size - TAIL_BYTES))
tail = f.read().decode("utf-8", errors="ignore")
lines = tail.strip().split("\n")
for line in reversed(lines):
try:
obj = json.loads(line.strip())
if "timestamp" in obj:
return obj["timestamp"]
except (json.JSONDecodeError, KeyError):
pass
except (OSError, IOError):
pass
return None
def process_file(filepath):
try:
size = os.path.getsize(filepath)
with open(filepath, "r") as f:
lines = []
for i, line in enumerate(f):
if i >= MAX_LINES:
break
lines.append(line)
result = extract_from_lines(lines)
if result:
result["file"] = filepath
result["size"] = size
if result["platform"] == "cursor":
# Cursor transcripts have no timestamps in JSONL.
# Use file modification time as the best available signal.
# Derive session ID from the parent directory name (UUID).
mtime = os.path.getmtime(filepath)
result["ts"] = datetime.fromtimestamp(mtime, tz=timezone.utc).isoformat()
result["session"] = os.path.basename(os.path.dirname(filepath))
else:
last_ts = get_last_timestamp(filepath, size)
if last_ts:
result["last_ts"] = last_ts
return result, None
else:
return None, filepath
    except (OSError, IOError):
return None, filepath
# Parse arguments: files and optional --cwd-filter <substring>
files = []
cwd_filter = None
args = sys.argv[1:]
i = 0
while i < len(args):
if args[i] == "--cwd-filter" and i + 1 < len(args):
cwd_filter = args[i + 1]
i += 2
elif not args[i].startswith("-"):
files.append(args[i])
i += 1
else:
i += 1
if files:
# Batch mode: process all files
processed = 0
parse_errors = 0
filtered = 0
for filepath in files:
if not filepath.endswith(".jsonl"):
continue
result, error = process_file(filepath)
processed += 1
if result:
# Apply CWD filter: skip Codex sessions from other repos
if cwd_filter and result.get("cwd") and cwd_filter not in result["cwd"]:
filtered += 1
continue
print(json.dumps(result))
elif error:
parse_errors += 1
meta = {"_meta": True, "files_processed": processed, "parse_errors": parse_errors}
if filtered:
meta["filtered_by_cwd"] = filtered
print(json.dumps(meta))
else:
# No file arguments: either single-file stdin mode or empty xargs invocation.
# When xargs runs us with no input (e.g., discover found no files), stdin is
# empty or a TTY — emit a clean zero-file result instead of a false parse error.
if sys.stdin.isatty():
lines = []
else:
lines = list(sys.stdin)
if not lines:
# No input at all — zero-file result (clean exit for empty pipelines)
print(json.dumps({"_meta": True, "files_processed": 0, "parse_errors": 0}))
else:
# Genuine single-file stdin mode (backward compatible)
result = extract_from_lines(lines)
if result:
print(json.dumps(result))
print(json.dumps({"_meta": True, "files_processed": 1, "parse_errors": 0 if result else 1}))
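
An end-to-end sketch combining both scripts. The repo folder name "my-repo" is hypothetical, and the output lines only illustrate the expected shape; they are not from a real run:

bash discover-sessions.sh my-repo 7 | xargs python3 extract-metadata.py --cwd-filter my-repo
# Illustrative output: one JSON object per discovered session file, then the _meta summary.
# {"platform": "claude", "branch": "main", "ts": "...", "session": "...", "file": "...", "size": 48213, "last_ts": "..."}
# {"_meta": true, "files_processed": 1, "parse_errors": 0}
# With GNU xargs, an empty discovery still invokes the extractor once with no file
# arguments; that hits the zero-file branch above and prints a clean zero-count _meta line.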