refactor(session-history): move extraction scripts behind skills (#619)
This commit is contained in:
@@ -0,0 +1,64 @@
|
||||
---
|
||||
name: ce-session-extract
|
||||
description: "Extract conversation skeleton or error signals from a single session file at a given path. Invoked by session-research agents after they have selected which sessions to deep-dive — not intended for direct user queries."
|
||||
user-invocable: false
|
||||
context: fork
|
||||
---
|
||||
|
||||
# Session extract
|
||||
|
||||
Agent-facing primitive. Extract filtered content from a single Claude Code, Codex, or Cursor session file — either a conversation skeleton or error signals.
|
||||
|
||||
This skill exists so that agents do not read multi-megabyte session files into context. The scripts under `scripts/` own the JSONL shape knowledge and emit a narrative-readable digest.
|
||||
|
||||
## Arguments
|
||||
|
||||
Space-separated positional args:
|
||||
|
||||
1. `<file>` — absolute path to a session JSONL file (typically a `file` value returned by `ce-session-inventory`).
|
||||
2. `<mode>` — `skeleton` or `errors`.
|
||||
3. `<limit>` *(optional)* — `head:N` or `tail:N` to cap output at N lines (e.g., `head:200`). Omit to return full extraction.
|
||||
|
||||
## Execution
|
||||
|
||||
**Skeleton mode** — narrative of user messages, assistant text, and collapsed tool-call summaries:
|
||||
|
||||
```bash
|
||||
cat <file> | python3 scripts/extract-skeleton.py
|
||||
```
|
||||
|
||||
**Errors mode** — just error signals:
|
||||
|
||||
```bash
|
||||
cat <file> | python3 scripts/extract-errors.py
|
||||
```
|
||||
|
||||
If `<limit>` is `head:N`, pipe through `head -n N`. If `tail:N`, pipe through `tail -n N`. Apply the limit after the Python script, never before — the `_meta` line is emitted last and a head cap may drop it; that is acceptable when the caller asks for a head cap.
|
||||
|
||||
Return the raw stdout verbatim. Do not paraphrase, annotate, or synthesize — the caller does synthesis across multiple sessions.
|
||||
|
||||
## What each mode returns
|
||||
|
||||
### Skeleton
|
||||
|
||||
Narrative output with one logical event per block, separated by `---`:
|
||||
|
||||
- User messages (text only, no tool results, framework wrapper tags stripped)
|
||||
- Assistant text (no thinking/reasoning blocks — those are internal or encrypted)
|
||||
- Tool call summaries; 3+ consecutive same-name calls are collapsed (e.g., `[tools] 5x Read (file1, file2, +3 more) -> all ok`)
|
||||
|
||||
Ends with a `_meta` line: `{"_meta": true, "lines": N, "parse_errors": N, "user": N, "assistant": N, "tool": N}`.
|
||||
|
||||
### Errors
|
||||
|
||||
One line per error, separated by `---`:
|
||||
|
||||
- Claude Code: tool results with `is_error: true`
|
||||
- Codex: `exec_command_end` events with non-zero exit or non-empty stderr
|
||||
- Cursor: always empty — Cursor agent transcripts do not log tool results
|
||||
|
||||
Ends with a `_meta` line: `{"_meta": true, "lines": N, "parse_errors": N, "errors_found": N}`.
|
||||
|
||||
## Error handling
|
||||
|
||||
If the file cannot be read, let the error surface to the caller. If `_meta` reports `parse_errors > 0`, return the output as-is — partial extraction is still useful and the caller decides whether to widen the search or deep-dive further.
|
||||
@@ -0,0 +1,104 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Extract error signals from a Claude Code, Codex, or Cursor JSONL session file.
|
||||
|
||||
Usage: cat <session.jsonl> | python3 extract-errors.py
|
||||
|
||||
Auto-detects platform from the JSONL structure.
|
||||
Note: Cursor agent transcripts do not log tool results, so no errors can be extracted.
|
||||
Finds failed tool calls / commands and outputs them with timestamps.
|
||||
Outputs a _meta line at the end with processing stats.
|
||||
"""
|
||||
import sys
|
||||
import json
|
||||
|
||||
# Global processing counters; emitted verbatim in the final _meta line.
stats = {"lines": 0, "parse_errors": 0, "errors_found": 0}
|
||||
|
||||
|
||||
def summarize_error(raw):
    """Return a compact one-line summary of an error payload.

    The payload may be any object; it is stringified, and the first
    non-blank line — capped at 200 characters — becomes the summary.
    Falls back to the (stripped) full text when no non-blank line exists.
    """
    stringified = str(raw).strip()
    first_line = next(
        (candidate.strip() for candidate in stringified.split("\n") if candidate.strip()),
        stringified,
    )
    return first_line[:200]
|
||||
|
||||
|
||||
def handle_claude(obj):
    """Emit error lines for a Claude Code entry.

    Tool results arrive inside 'user' entries; any tool_result block
    flagged is_error is printed with its timestamp and a one-line summary,
    followed by a '---' separator.
    """
    if obj.get("type") == "user":
        content = obj.get("message", {}).get("content", [])
        # Content may be a plain string (no tool results) — only lists carry blocks.
        if isinstance(content, list):
            for block in content:
                if block.get("type") == "tool_result" and block.get("is_error"):
                    # Truncate ISO timestamp to seconds precision.
                    ts = obj.get("timestamp", "")[:19]
                    summary = summarize_error(block.get("content", ""))
                    print(f"[{ts}] [error] {summary}")
                    print("---")
                    stats["errors_found"] += 1
|
||||
|
||||
|
||||
def handle_codex(obj):
    """Emit error lines for a Codex entry.

    Only exec_command_end events are inspected: a command counts as failed
    when its aggregated output reports a non-zero exit code or when stderr
    is non-empty. Note that stderr-only failures print 'exit=None'.
    """
    if obj.get("type") == "event_msg":
        p = obj.get("payload", {})
        if p.get("type") == "exec_command_end":
            output = p.get("aggregated_output", "")
            stderr = p.get("stderr", "")
            command = p.get("command", [])
            # Last argv element is the actual shell command (e.g. after "bash -c").
            cmd_str = command[-1] if command else ""

            # Parse a non-zero exit code out of the sentinel line in the output.
            exit_match = None
            if "Process exited with code " in output:
                try:
                    code_str = output.split("Process exited with code ")[1].split("\n")[0]
                    exit_code = int(code_str)
                    if exit_code != 0:
                        exit_match = exit_code
                except (IndexError, ValueError):
                    # Malformed sentinel — treat as no exit-code signal.
                    pass

            if exit_match is not None or stderr:
                ts = obj.get("timestamp", "")[:19]
                # Prefer stderr as the error text; fall back to the output.
                error_summary = summarize_error(stderr if stderr else output)
                print(f"[{ts}] [error] exit={exit_match} cmd={cmd_str[:120]}: {error_summary}")
                print("---")
                stats["errors_found"] += 1
|
||||
|
||||
|
||||
# Auto-detect platform from first few lines, then process all
detected = None
buffer = []

# First pass: buffer every non-empty stdin line while sniffing the platform
# from the structure of the first 10 lines (Claude/Codex carry a 'type'
# field; Cursor entries carry a bare top-level 'role' and no 'type').
for line in sys.stdin:
    line = line.strip()
    if not line:
        continue
    buffer.append(line)
    stats["lines"] += 1

    if not detected and len(buffer) <= 10:
        try:
            obj = json.loads(line)
            if obj.get("type") in ("user", "assistant"):
                detected = "claude"
            elif obj.get("type") in ("session_meta", "turn_context", "response_item", "event_msg"):
                detected = "codex"
            elif obj.get("role") in ("user", "assistant") and "type" not in obj:
                detected = "cursor"
        except (json.JSONDecodeError, KeyError):
            # Unparseable sniff line — keep buffering; a later line may match.
            pass

# Cursor transcripts don't log tool results — no errors to extract
def handle_noop(obj):
    pass

handlers = {"claude": handle_claude, "codex": handle_codex, "cursor": handle_noop}
# Undetected platform falls back to the no-op handler: emit only the _meta line.
handler = handlers.get(detected, handle_noop)

# Second pass: dispatch every buffered line to the chosen handler.
for line in buffer:
    try:
        handler(json.loads(line))
    except (json.JSONDecodeError, KeyError):
        stats["parse_errors"] += 1

# Final stats line; callers rely on this being last on stdout.
print(json.dumps({"_meta": True, **stats}))
|
||||
@@ -0,0 +1,317 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Extract the conversation skeleton from a Claude Code, Codex, or Cursor JSONL session file.
|
||||
|
||||
Usage: cat <session.jsonl> | python3 extract-skeleton.py
|
||||
|
||||
Auto-detects platform (Claude Code, Codex, or Cursor) from the JSONL structure.
|
||||
Extracts:
|
||||
- User messages (text only, no tool results)
|
||||
- Assistant text (no thinking/reasoning blocks)
|
||||
- Collapsed tool call summaries (consecutive same-tool calls grouped)
|
||||
|
||||
Consecutive tool calls of the same type are collapsed:
|
||||
3+ Read calls -> "[tools] 3x Read (file1, file2, +1 more) -> all ok"
|
||||
Codex call/result pairs are deduplicated (only the result with status is kept).
|
||||
Outputs a _meta line at the end with processing stats.
|
||||
"""
|
||||
import sys
|
||||
import json
|
||||
import re
|
||||
|
||||
# Global processing counters; emitted verbatim in the final _meta line.
stats = {"lines": 0, "parse_errors": 0, "user": 0, "assistant": 0, "tool": 0}
|
||||
|
||||
# Claude Code wrapper tags to strip from user message content.
# Strip entirely (tag + content): framework noise and raw command output.
# Strip tags only (keep content): command-message, command-name, command-args, user_query.
_STRIP_BLOCK = re.compile(
    r"<(?:task-notification|local-command-caveat|local-command-stdout|local-command-stderr|system-reminder)[^>]*>.*?</(?:task-notification|local-command-caveat|local-command-stdout|local-command-stderr|system-reminder)>",
    re.DOTALL,
)
_STRIP_TAG = re.compile(
    r"</?(?:command-message|command-name|command-args|user_query)[^>]*>"
)


def clean_text(text):
    """Remove framework wrapper tags from message text (Claude and Cursor).

    Noise blocks are dropped wholesale, keep-content tags are unwrapped,
    and runs of 3+ newlines are squeezed to a single blank line.
    """
    without_blocks = _STRIP_BLOCK.sub("", text)
    without_tags = _STRIP_TAG.sub("", without_blocks)
    return re.sub(r"\n{3,}", "\n\n", without_tags).strip()
|
||||
|
||||
# Buffer for pending tool entries: [{"ts", "name", "target", "status"}]
# Entries accumulate here until the next user/assistant text flushes them.
pending_tools = []


def flush_tools():
    """Print buffered tool entries, collapsing consecutive same-name groups.

    Runs of 1-2 same-name calls print individually as '[tool] ...' lines;
    3+ collapse into a single '[tools] Nx name (targets) -> status' line.
    Clears the buffer afterwards and counts every entry in stats["tool"].
    """
    if not pending_tools:
        return

    # Group consecutive entries by tool name
    groups = []
    for entry in pending_tools:
        if groups and groups[-1][0]["name"] == entry["name"]:
            groups[-1].append(entry)
        else:
            groups.append([entry])

    for group in groups:
        name = group[0]["name"]
        if len(group) <= 2:
            # Print individually
            for e in group:
                status = f" -> {e['status']}" if e.get("status") else ""
                # Cursor entries have no timestamps — omit the prefix then.
                ts_prefix = f"[{e['ts']}] " if e.get("ts") else ""
                print(f"{ts_prefix}[tool] {name} {e['target']}{status}")
                stats["tool"] += 1
        else:
            # Collapse: one line summarizes the whole run.
            ts = group[0].get("ts", "")
            targets = [e["target"] for e in group if e.get("target")]
            ok = sum(1 for e in group if e.get("status") == "ok")
            err = sum(1 for e in group if e.get("status") and e["status"] != "ok")
            no_status = len(group) - ok - err

            # Show first 2 targets, then "+N more"
            if len(targets) > 2:
                target_str = ", ".join(targets[:2]) + f", +{len(targets) - 2} more"
            elif targets:
                target_str = ", ".join(targets)
            else:
                target_str = ""

            # Status summary: silent when nothing reported, terse when clean.
            if no_status == len(group):
                status_str = ""
            elif err == 0:
                status_str = " -> all ok"
            else:
                status_str = f" -> {ok} ok, {err} error"

            ts_prefix = f"[{ts}] " if ts else ""
            print(f"{ts_prefix}[tools] {len(group)}x {name} ({target_str}){status_str}")
            stats["tool"] += len(group)

    pending_tools.clear()
|
||||
|
||||
|
||||
def summarize_claude_tool(block):
    """Derive a (name, target) pair from a Claude Code tool_use block.

    The target is the first populated input field in priority order
    (file path, command, pattern, query, prompt). Free-text fields are
    pre-truncated, and the final target is capped at 120 characters.
    """
    tool_name = block.get("name", "unknown")
    tool_input = block.get("input", {})
    candidates = (
        tool_input.get("file_path"),
        tool_input.get("path"),
        tool_input.get("command", "")[:120],
        tool_input.get("pattern", ""),
        tool_input.get("query", "")[:80],
        tool_input.get("prompt", "")[:80],
    )
    target = next((c for c in candidates if c), "")
    if isinstance(target, str) and len(target) > 120:
        target = target[:120]
    return tool_name, target
|
||||
|
||||
|
||||
def handle_claude(obj):
    """Process one Claude Code entry: user text, assistant text, or tool use.

    User entries also carry tool_result blocks, which are matched back to
    buffered pending_tools entries (by tool_use_id, else first unstatused)
    to attach ok/error status before the buffer is flushed.
    """
    msg_type = obj.get("type")
    # Truncate ISO timestamp to seconds precision.
    ts = obj.get("timestamp", "")[:19]

    if msg_type == "user":
        msg = obj.get("message", {})
        content = msg.get("content", "")

        if isinstance(content, list):
            # Pass 1: attach result status to buffered tool calls.
            for block in content:
                if block.get("type") == "tool_result":
                    is_error = block.get("is_error", False)
                    status = "error" if is_error else "ok"
                    tool_use_id = block.get("tool_use_id")
                    matched = False
                    if tool_use_id:
                        for entry in pending_tools:
                            if entry.get("id") == tool_use_id:
                                entry["status"] = status
                                matched = True
                                break
                    if not matched:
                        # Fallback: assign to earliest pending entry without a status
                        for entry in pending_tools:
                            if not entry.get("status"):
                                entry["status"] = status
                                break

            # Pass 2: collect the genuine text blocks (>10 chars filters
            # out trivial acknowledgements) and fall through as a string.
            texts = [
                c.get("text", "")
                for c in content
                if c.get("type") == "text" and len(c.get("text", "")) > 10
            ]
            content = " ".join(texts)

        if isinstance(content, str):
            content = clean_text(content)
            # Skip near-empty messages (wrapper-tag residue etc.).
            if len(content) > 15:
                flush_tools()
                print(f"[{ts}] [user] {content[:800]}")
                print("---")
                stats["user"] += 1

    elif msg_type == "assistant":
        msg = obj.get("message", {})
        content = msg.get("content", [])
        if isinstance(content, list):
            has_text = False
            for block in content:
                if block.get("type") == "text":
                    text = clean_text(block.get("text", ""))
                    if len(text) > 20:
                        # Flush pending tools before the first text block only,
                        # so tool summaries appear ahead of the reply.
                        if not has_text:
                            flush_tools()
                        has_text = True
                        print(f"[{ts}] [assistant] {text[:800]}")
                        print("---")
                        stats["assistant"] += 1
                elif block.get("type") == "tool_use":
                    # Buffer the call; status arrives later via tool_result.
                    name, target = summarize_claude_tool(block)
                    entry = {"ts": ts, "name": name, "target": target}
                    tool_id = block.get("id")
                    if tool_id:
                        entry["id"] = tool_id
                    pending_tools.append(entry)
|
||||
|
||||
|
||||
def handle_codex(obj):
    """Process one Codex entry: user message, exec result, or assistant text.

    exec_command_end events are buffered into pending_tools as 'exec'
    entries (they already carry status); function_call entries are skipped
    since exec_command_end is the deduplicated version.
    """
    msg_type = obj.get("type")
    # Truncate ISO timestamp to seconds precision.
    ts = obj.get("timestamp", "")[:19]

    if msg_type == "event_msg":
        p = obj.get("payload", {})
        if p.get("type") == "user_message":
            text = p.get("message", "")
            if isinstance(text, str) and len(text) > 15:
                # Drop any leading system-instruction preamble; keep what follows.
                parts = text.split("</system_instruction>")
                user_text = parts[-1].strip() if parts else text
                if len(user_text) > 15:
                    flush_tools()
                    print(f"[{ts}] [user] {user_text[:800]}")
                    print("---")
                    stats["user"] += 1

        elif p.get("type") == "exec_command_end":
            # This is the deduplicated result — has status info
            command = p.get("command", [])
            # Last argv element is the actual shell command string.
            cmd_str = command[-1] if command else ""
            output = p.get("aggregated_output", "")

            # Derive ok/error status from the exit-code sentinel in the output.
            status = "ok"
            if "Process exited with code " in output:
                try:
                    code = int(output.split("Process exited with code ")[1].split("\n")[0])
                    if code != 0:
                        status = f"error(exit {code})"
                except (IndexError, ValueError):
                    pass

            if cmd_str:
                # Shorten common patterns for readability
                short_cmd = cmd_str[:120]
                pending_tools.append({"ts": ts, "name": "exec", "target": short_cmd, "status": status})

    elif msg_type == "response_item":
        p = obj.get("payload", {})
        if p.get("type") == "message" and p.get("role") == "assistant":
            for block in p.get("content", []):
                if block.get("type") == "output_text" and len(block.get("text", "")) > 20:
                    flush_tools()
                    print(f"[{ts}] [assistant] {block['text'][:800]}")
                    print("---")
                    stats["assistant"] += 1

    # Skip function_call — exec_command_end is the deduplicated version with status
|
||||
|
||||
|
||||
def handle_cursor(obj):
    """Cursor agent transcripts: role-based, no timestamps, same content structure as Claude.

    Tool calls are buffered without status (Cursor never logs tool results)
    and all printed lines omit the timestamp prefix.
    """
    role = obj.get("role")
    content = obj.get("message", {}).get("content", [])

    if role == "user":
        texts = []
        for block in (content if isinstance(content, list) else []):
            if block.get("type") == "text":
                texts.append(block.get("text", ""))
        text = clean_text(" ".join(texts))
        # Skip near-empty messages.
        if len(text) > 15:
            flush_tools()
            # No timestamps available in Cursor transcripts
            print(f"[user] {text[:800]}")
            print("---")
            stats["user"] += 1

    elif role == "assistant":
        has_text = False
        for block in (content if isinstance(content, list) else []):
            if block.get("type") == "text":
                text = block.get("text", "")
                # Skip [REDACTED] placeholder blocks
                if len(text) > 20 and text.strip() != "[REDACTED]":
                    # Flush pending tools before the first text block only.
                    if not has_text:
                        flush_tools()
                    has_text = True
                    print(f"[assistant] {text[:800]}")
                    print("---")
                    stats["assistant"] += 1
            elif block.get("type") == "tool_use":
                name = block.get("name", "unknown")
                inp = block.get("input", {})
                # First populated input field wins, in priority order.
                target = (
                    inp.get("path")
                    or inp.get("file_path")
                    or inp.get("command", "")[:120]
                    or inp.get("pattern", "")
                    or inp.get("glob_pattern", "")
                    or inp.get("target_directory", "")
                    or ""
                )
                if isinstance(target, str) and len(target) > 120:
                    target = target[:120]
                # No status info available — Cursor doesn't log tool results
                pending_tools.append({"ts": "", "name": name, "target": target})
|
||||
|
||||
|
||||
# Auto-detect platform from first few lines, then process all
detected = None
buffer = []

# First pass: buffer every non-empty stdin line while sniffing the platform
# from the structure of the first 10 lines (Claude/Codex carry a 'type'
# field; Cursor entries carry a bare top-level 'role' and no 'type').
for line in sys.stdin:
    line = line.strip()
    if not line:
        continue
    buffer.append(line)
    stats["lines"] += 1

    if not detected and len(buffer) <= 10:
        try:
            obj = json.loads(line)
            if obj.get("type") in ("user", "assistant"):
                detected = "claude"
            elif obj.get("type") in ("session_meta", "turn_context", "response_item", "event_msg"):
                detected = "codex"
            elif obj.get("role") in ("user", "assistant") and "type" not in obj:
                detected = "cursor"
        except (json.JSONDecodeError, KeyError):
            # Unparseable sniff line — keep buffering; a later line may match.
            pass

handlers = {"claude": handle_claude, "codex": handle_codex, "cursor": handle_cursor}
# NOTE(review): unlike extract-errors.py (no-op fallback), an undetected
# platform falls back to the Codex handler here — presumably intentional;
# confirm before changing.
handler = handlers.get(detected, handle_codex)

# Second pass: dispatch every buffered line to the chosen handler.
for line in buffer:
    try:
        handler(json.loads(line))
    except (json.JSONDecodeError, KeyError):
        stats["parse_errors"] += 1

# Flush any remaining buffered tools
flush_tools()

# Final stats line; callers rely on this being last on stdout.
print(json.dumps({"_meta": True, **stats}))
|
||||
@@ -0,0 +1,58 @@
|
||||
---
|
||||
name: ce-session-inventory
|
||||
description: "Discover session files for a repo across Claude Code, Codex, and Cursor, and extract session metadata (timestamps, branch, cwd, size, platform). Invoked by session-research agents — not intended for direct user queries."
|
||||
user-invocable: false
|
||||
context: fork
|
||||
---
|
||||
|
||||
# Session inventory
|
||||
|
||||
Agent-facing primitive. Discover session files and emit session metadata as JSONL across Claude Code, Codex, and Cursor.
|
||||
|
||||
This skill exists so that agents researching session history do not need to know the layout of session stores on disk or the JSONL shapes of each platform. The scripts under `scripts/` own that knowledge.
|
||||
|
||||
## Arguments
|
||||
|
||||
Space-separated positional args:
|
||||
|
||||
1. `<repo>` — repo folder name (e.g., `my-project`). Used for directory matching in Claude Code and Cursor, and as the CWD filter for Codex sessions.
|
||||
2. `<days>` — scan window in days (e.g., `7`). Session files older than this are skipped.
|
||||
3. `<platform>` *(optional)* — one of `claude`, `codex`, `cursor`. Omit to search all three.
|
||||
|
||||
## Execution
|
||||
|
||||
Run the discovery-plus-metadata pipeline from the skill's own `scripts/` directory:
|
||||
|
||||
```bash
|
||||
bash scripts/discover-sessions.sh <repo> <days> [--platform <platform>] \
|
||||
| tr '\n' '\0' \
|
||||
| xargs -0 python3 scripts/extract-metadata.py --cwd-filter <repo>
|
||||
```
|
||||
|
||||
Return the raw stdout verbatim — one JSON object per session, then a final `_meta` line. Callers parse the JSONL directly, so do not paraphrase, reformat, or summarize.
|
||||
|
||||
If discovery finds no files, the pipeline still emits a clean `_meta` line (`files_processed: 0`). Return that as-is.
|
||||
|
||||
## Output format
|
||||
|
||||
Each session line is a JSON object. Common fields across platforms:
|
||||
|
||||
- `platform` — `claude`, `codex`, or `cursor`
|
||||
- `file` — absolute path to the session JSONL
|
||||
- `size` — file size in bytes
|
||||
- `ts` — session start timestamp (ISO 8601)
|
||||
- `session` — session identifier
|
||||
|
||||
Platform-specific fields:
|
||||
|
||||
- Claude Code adds `branch` (git branch) and `last_ts` (last message timestamp).
|
||||
- Codex adds `cwd` (working directory), `source`, `cli_version`, `model`, `last_ts`.
|
||||
- Cursor has no in-file timestamps or metadata — `ts` is derived from file mtime and `session` from the containing directory name.
|
||||
|
||||
The final `_meta` line has `files_processed`, `parse_errors`, and optionally `filtered_by_cwd` (count of Codex sessions dropped by the CWD filter).
|
||||
|
||||
## Error handling
|
||||
|
||||
If the discovery script errors (e.g., unreadable home directory, permission failure), let the error surface to the caller. Do not substitute git log, file listings, or other sources — this skill's contract is session metadata, nothing else.
|
||||
|
||||
If `_meta` reports `parse_errors > 0`, return the JSONL as-is. The caller decides how to handle partial data.
|
||||
@@ -0,0 +1,81 @@
|
||||
#!/usr/bin/env bash
|
||||
# Discover session files across Claude Code, Codex, and Cursor.
|
||||
#
|
||||
# Usage: discover-sessions.sh <repo-name> <days> [--platform claude|codex|cursor]
|
||||
#
|
||||
# Outputs one file path per line. Safe in both bash and zsh (all globs guarded).
|
||||
# Pass output to extract-metadata.py:
|
||||
# python3 extract-metadata.py --cwd-filter <repo-name> $(bash discover-sessions.sh <repo-name> 7)
|
||||
#
|
||||
# Arguments:
|
||||
# repo-name Folder name of the repo (e.g., "my-repo"). Used for directory matching.
|
||||
# days Scan window in days (e.g., 7). Files older than this are skipped.
|
||||
# --platform Restrict to a single platform. Omit to search all.
|
||||
|
||||
set -euo pipefail

REPO_NAME="${1:?Usage: discover-sessions.sh <repo-name> <days> [--platform claude|codex|cursor]}"
DAYS="${2:?Usage: discover-sessions.sh <repo-name> <days> [--platform claude|codex|cursor]}"

# Default platform; only an explicit --platform flag overrides it.
# (Previously initialized from "$4", which leaked a stray positional arg
# into the platform dispatch and could trigger "Unknown platform" errors.)
PLATFORM="all"

# Parse optional --platform flag from the remaining args
shift 2
while [ $# -gt 0 ]; do
  case "$1" in
    --platform) PLATFORM="$2"; shift 2 ;;
    *) shift ;;  # ignore unrecognized args
  esac
done
|
||||
|
||||
# --- Claude Code ---
# Claude stores sessions as <session-id>.jsonl under per-project dirs in
# ~/.claude/projects/; match any project dir whose name contains the repo name.
discover_claude() {
  local base="$HOME/.claude/projects"
  # No project store at all — succeed quietly (set -e is active).
  [ -d "$base" ] || return 0

  # Find all project dirs matching repo name
  for dir in "$base"/*"$REPO_NAME"*/; do
    # Guard the glob: with no match the literal pattern survives in $dir.
    [ -d "$dir" ] || continue
    find "$dir" -maxdepth 1 -name "*.jsonl" -mtime "-${DAYS}" 2>/dev/null
  done
}
|
||||
|
||||
# --- Codex ---
# Codex session stores are not per-repo, so no name matching happens here;
# repo filtering is done later by extract-metadata.py via --cwd-filter.
discover_codex() {
  for base in "$HOME/.codex/sessions" "$HOME/.agents/sessions"; do
    [ -d "$base" ] || continue

    # Use mtime-based discovery (consistent with Claude/Cursor) so that
    # sessions started before the scan window but still active within it
    # are not missed.
    find "$base" -name "*.jsonl" -mtime "-${DAYS}" 2>/dev/null
  done
}
|
||||
|
||||
# --- Cursor ---
# Cursor keeps agent transcripts under per-project dirs in ~/.cursor/projects/,
# in an agent-transcripts/ subdirectory.
discover_cursor() {
  local base="$HOME/.cursor/projects"
  # No project store at all — succeed quietly (set -e is active).
  [ -d "$base" ] || return 0

  for dir in "$base"/*"$REPO_NAME"*/; do
    # Guard the glob: with no match the literal pattern survives in $dir.
    [ -d "$dir" ] || continue
    local transcripts="$dir/agent-transcripts"
    [ -d "$transcripts" ] || continue
    find "$transcripts" -name "*.jsonl" -mtime "-${DAYS}" 2>/dev/null
  done
}
|
||||
|
||||
# --- Dispatch ---
# Run the selected discoverer(s); "all" concatenates every platform's output.
case "$PLATFORM" in
  claude) discover_claude ;;
  codex) discover_codex ;;
  cursor) discover_cursor ;;
  all)
    discover_claude
    discover_codex
    discover_cursor
    ;;
  *)
    # Invalid --platform value — fail loudly rather than silently emit nothing.
    echo "Unknown platform: $PLATFORM" >&2
    exit 1
    ;;
esac
|
||||
@@ -0,0 +1,187 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Extract session metadata from Claude Code, Codex, and Cursor JSONL files.
|
||||
|
||||
Batch mode (preferred — one invocation for all files):
|
||||
python3 extract-metadata.py /path/to/dir/*.jsonl
|
||||
python3 extract-metadata.py file1.jsonl file2.jsonl file3.jsonl
|
||||
|
||||
Single-file mode (stdin):
|
||||
head -20 <session.jsonl> | python3 extract-metadata.py
|
||||
|
||||
Auto-detects platform from the JSONL structure.
|
||||
Outputs one JSON object per file, one per line.
|
||||
Includes a final _meta line with processing stats.
|
||||
"""
|
||||
import sys
|
||||
import json
|
||||
import os
|
||||
|
||||
MAX_LINES = 25 # Only need first ~25 lines for metadata
|
||||
|
||||
|
||||
def try_claude(lines):
    """Detect a Claude Code session and pull its metadata.

    Scans for the first 'user' entry carrying a gitBranch field and
    returns its metadata dict, or None when no such entry is found.
    """
    for raw in lines:
        try:
            entry = json.loads(raw.strip())
        except json.JSONDecodeError:
            continue
        if entry.get("type") != "user" or "gitBranch" not in entry:
            continue
        return {
            "platform": "claude",
            "branch": entry["gitBranch"],
            "ts": entry.get("timestamp", ""),
            "session": entry.get("sessionId", ""),
        }
    return None
|
||||
|
||||
|
||||
def try_codex(lines):
    """Detect a Codex session and accumulate its metadata.

    A session_meta entry supplies the identity fields; a turn_context
    entry supplies the model and a fallback cwd. Returns the accumulated
    dict, or None when neither entry type appears.
    """
    meta = {}
    for raw in lines:
        try:
            entry = json.loads(raw.strip())
        except json.JSONDecodeError:
            continue
        kind = entry.get("type")
        payload = entry.get("payload", {})
        if kind == "session_meta":
            meta.update(
                platform="codex",
                cwd=payload.get("cwd", ""),
                session=payload.get("id", ""),
                ts=payload.get("timestamp", entry.get("timestamp", "")),
                source=payload.get("source", ""),
                cli_version=payload.get("cli_version", ""),
            )
        elif kind == "turn_context":
            meta["model"] = payload.get("model", "")
            # session_meta's cwd wins; turn_context only fills a gap.
            meta["cwd"] = meta.get("cwd") or payload.get("cwd", "")
    return meta or None
|
||||
|
||||
|
||||
def try_cursor(lines):
    """Detect a Cursor agent transcript.

    Cursor entries carry a top-level 'role' but no 'type' field; the
    transcript itself holds no timestamps or session metadata, so only
    the platform marker is returned.
    """
    for raw in lines:
        try:
            entry = json.loads(raw.strip())
        except json.JSONDecodeError:
            continue
        if "type" not in entry and entry.get("role") in ("user", "assistant"):
            return {"platform": "cursor"}
    return None
|
||||
|
||||
|
||||
def extract_from_lines(lines):
    """Run the platform detectors in priority order; first match wins."""
    return try_claude(lines) or try_codex(lines) or try_cursor(lines)
|
||||
|
||||
|
||||
TAIL_BYTES = 16384  # Read last 16KB to find final timestamp past trailing metadata


def get_last_timestamp(filepath, size):
    """Scan the file's tail, newest line first, for the last 'timestamp' value.

    Returns the timestamp string, or None when the file cannot be read
    or no parseable tail entry carries a timestamp.
    """
    try:
        with open(filepath, "rb") as fh:
            fh.seek(max(0, size - TAIL_BYTES))
            tail_text = fh.read().decode("utf-8", errors="ignore")
    except OSError:
        return None

    for candidate in reversed(tail_text.strip().split("\n")):
        try:
            parsed = json.loads(candidate.strip())
        except json.JSONDecodeError:
            continue
        if "timestamp" in parsed:
            return parsed["timestamp"]
    return None
|
||||
|
||||
|
||||
def process_file(filepath):
    """Extract metadata for one session JSONL file.

    Reads at most MAX_LINES lines from the head (platform metadata appears
    early), then augments the result with file size and a last-activity
    signal.

    Returns a (result, error) pair: result is the metadata dict or None;
    error is the filepath when the file was unreadable or matched no known
    platform, else None.
    """
    try:
        size = os.path.getsize(filepath)
        # errors="ignore" mirrors the tail decode in get_last_timestamp: a
        # single invalid byte must not abort the whole batch with an
        # uncaught UnicodeDecodeError (only OSError was handled before).
        lines = []
        with open(filepath, "r", errors="ignore") as f:
            for i, line in enumerate(f):
                if i >= MAX_LINES:
                    break
                lines.append(line)

        result = extract_from_lines(lines)
        if not result:
            # Unrecognized shape — report the file as an error.
            return None, filepath

        result["file"] = filepath
        result["size"] = size
        if result["platform"] == "cursor":
            # Cursor transcripts have no timestamps in JSONL.
            # Use file modification time as the best available signal.
            # Derive session ID from the parent directory name (UUID).
            from datetime import datetime, timezone

            mtime = os.path.getmtime(filepath)
            result["ts"] = datetime.fromtimestamp(mtime, tz=timezone.utc).isoformat()
            result["session"] = os.path.basename(os.path.dirname(filepath))
        else:
            last_ts = get_last_timestamp(filepath, size)
            if last_ts:
                result["last_ts"] = last_ts
        return result, None
    except OSError:
        return None, filepath
|
||||
|
||||
|
||||
# Parse arguments: files and optional --cwd-filter <substring>
files = []
cwd_filter = None
args = sys.argv[1:]
i = 0
while i < len(args):
    if args[i] == "--cwd-filter" and i + 1 < len(args):
        cwd_filter = args[i + 1]
        i += 2
    elif not args[i].startswith("-"):
        files.append(args[i])
        i += 1
    else:
        # Unknown flag — skip it rather than fail.
        i += 1

if files:
    # Batch mode: process all files
    processed = 0
    parse_errors = 0
    filtered = 0
    for filepath in files:
        # Silently skip non-session files that may leak in via globs/xargs.
        if not filepath.endswith(".jsonl"):
            continue
        result, error = process_file(filepath)
        # Counts attempts, including files that end up as parse errors.
        processed += 1
        if result:
            # Apply CWD filter: skip Codex sessions from other repos
            # (only entries that actually carry a cwd are filtered).
            if cwd_filter and result.get("cwd") and cwd_filter not in result["cwd"]:
                filtered += 1
                continue
            print(json.dumps(result))
        elif error:
            parse_errors += 1

    meta = {"_meta": True, "files_processed": processed, "parse_errors": parse_errors}
    if filtered:
        meta["filtered_by_cwd"] = filtered
    print(json.dumps(meta))
else:
    # No file arguments: either single-file stdin mode or empty xargs invocation.
    # When xargs runs us with no input (e.g., discover found no files), stdin is
    # empty or a TTY — emit a clean zero-file result instead of a false parse error.
    if sys.stdin.isatty():
        lines = []
    else:
        lines = list(sys.stdin)

    if not lines:
        # No input at all — zero-file result (clean exit for empty pipelines)
        print(json.dumps({"_meta": True, "files_processed": 0, "parse_errors": 0}))
    else:
        # Genuine single-file stdin mode (backward compatible)
        result = extract_from_lines(lines)
        if result:
            print(json.dumps(result))
        print(json.dumps({"_meta": True, "files_processed": 1, "parse_errors": 0 if result else 1}))
|
||||
Reference in New Issue
Block a user